1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2004, 2007 Lukas Ertl
5 * Copyright (c) 2007, 2009 Ulf Lilleengen
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include <sys/param.h>
31#include <sys/bio.h>
32#include <sys/lock.h>
33#include <sys/malloc.h>
34#include <sys/systm.h>
35
36#include <geom/geom.h>
37#include <geom/geom_dbg.h>
38#include <geom/vinum/geom_vinum_var.h>
39#include <geom/vinum/geom_vinum_raid5.h>
40#include <geom/vinum/geom_vinum.h>
41
42static int	gv_check_parity(struct gv_plex *, struct bio *,
43		    struct gv_raid5_packet *);
44static int	gv_normal_parity(struct gv_plex *, struct bio *,
45		    struct gv_raid5_packet *);
46static void	gv_plex_flush(struct gv_plex *);
47static int	gv_plex_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *,
48		    int *, int);
49static int 	gv_plex_normal_request(struct gv_plex *, struct bio *, off_t,
50		    off_t,  caddr_t);
51static void	gv_post_bio(struct gv_softc *, struct bio *);
52
53void
54gv_plex_start(struct gv_plex *p, struct bio *bp)
55{
56	struct bio *cbp;
57	struct gv_sd *s;
58	struct gv_raid5_packet *wp;
59	caddr_t addr;
60	off_t bcount, boff, len;
61
62	bcount = bp->bio_length;
63	addr = bp->bio_data;
64	boff = bp->bio_offset;
65
66	/* Walk over the whole length of the request, we might split it up. */
67	while (bcount > 0) {
68		wp = NULL;
69
70 		/*
71		 * RAID5 plexes need special treatment, as a single request
72		 * might involve several read/write sub-requests.
73 		 */
74		if (p->org == GV_PLEX_RAID5) {
75			wp = gv_raid5_start(p, bp, addr, boff, bcount);
76 			if (wp == NULL)
77 				return;
78
79			len = wp->length;
80
81			if (TAILQ_EMPTY(&wp->bits))
82				g_free(wp);
83			else if (wp->lockbase != -1)
84				TAILQ_INSERT_TAIL(&p->packets, wp, list);
85
86		/*
87		 * Requests to concatenated and striped plexes go straight
88		 * through.
89		 */
90		} else {
91			len = gv_plex_normal_request(p, bp, boff, bcount, addr);
92		}
93		if (len < 0)
94			return;
95
96		bcount -= len;
97		addr += len;
98		boff += len;
99	}
100
101	/*
102	 * Fire off all sub-requests.  We get the correct consumer (== drive)
103	 * to send each request to via the subdisk that was stored in
104	 * cbp->bio_caller1.
105	 */
106	cbp = bioq_takefirst(p->bqueue);
107	while (cbp != NULL) {
108		/*
109		 * RAID5 sub-requests need to come in correct order, otherwise
110		 * we trip over the parity, as it might be overwritten by
111		 * another sub-request.  We abuse cbp->bio_caller2 to mark
112		 * potential overlap situations.
113		 */
114		if (cbp->bio_caller2 != NULL && gv_stripe_active(p, cbp)) {
115			/* Park the bio on the waiting queue. */
116			cbp->bio_pflags |= GV_BIO_ONHOLD;
117			bioq_disksort(p->wqueue, cbp);
118		} else {
119			s = cbp->bio_caller1;
120			g_io_request(cbp, s->drive_sc->consumer);
121		}
122		cbp = bioq_takefirst(p->bqueue);
123	}
124}
125
126static int
127gv_plex_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
128    off_t *real_len, int *sdno, int growing)
129{
130	struct gv_sd *s;
131	int i, sdcount;
132	off_t len_left, stripeend, stripeno, stripestart;
133
134	switch (p->org) {
135	case GV_PLEX_CONCAT:
136		/*
137		 * Find the subdisk where this request starts.  The subdisks in
138		 * this list must be ordered by plex_offset.
139		 */
140		i = 0;
141		LIST_FOREACH(s, &p->subdisks, in_plex) {
142			if (s->plex_offset <= boff &&
143			    s->plex_offset + s->size > boff) {
144				*sdno = i;
145				break;
146			}
147			i++;
148		}
149		if (s == NULL || s->drive_sc == NULL)
150			return (GV_ERR_NOTFOUND);
151
152		/* Calculate corresponding offsets on disk. */
153		*real_off = boff - s->plex_offset;
154		len_left = s->size - (*real_off);
155		KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0"));
156		*real_len = (bcount > len_left) ? len_left : bcount;
157		break;
158
159	case GV_PLEX_STRIPED:
160		/* The number of the stripe where the request starts. */
161		stripeno = boff / p->stripesize;
162		KASSERT(stripeno >= 0, ("gv_plex_offset: stripeno < 0"));
163
164		/* Take growing subdisks into account when calculating. */
165		sdcount = gv_sdcount(p, (boff >= p->synced));
166
167		if (!(boff + bcount <= p->synced) &&
168		    (p->flags & GV_PLEX_GROWING) &&
169		    !growing)
170			return (GV_ERR_ISBUSY);
171		*sdno = stripeno % sdcount;
172
173		KASSERT(*sdno >= 0, ("gv_plex_offset: sdno < 0"));
174		stripestart = (stripeno / sdcount) *
175		    p->stripesize;
176		KASSERT(stripestart >= 0, ("gv_plex_offset: stripestart < 0"));
177		stripeend = stripestart + p->stripesize;
178		*real_off = boff - (stripeno * p->stripesize) +
179		    stripestart;
180		len_left = stripeend - *real_off;
181		KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0"));
182
183		*real_len = (bcount <= len_left) ? bcount : len_left;
184		break;
185
186	default:
187		return (GV_ERR_PLEXORG);
188	}
189	return (0);
190}
191
192/*
193 * Prepare a normal plex request.
194 */
195static int
196gv_plex_normal_request(struct gv_plex *p, struct bio *bp, off_t boff,
197    off_t bcount,  caddr_t addr)
198{
199	struct gv_sd *s;
200	struct bio *cbp;
201	off_t real_len, real_off;
202	int i, err, sdno;
203
204	s = NULL;
205	sdno = -1;
206	real_len = real_off = 0;
207
208	err = ENXIO;
209
210	if (p == NULL || LIST_EMPTY(&p->subdisks))
211		goto bad;
212
213	err = gv_plex_offset(p, boff, bcount, &real_off,
214	    &real_len, &sdno, (bp->bio_pflags & GV_BIO_GROW));
215	/* If the request was blocked, put it into wait. */
216	if (err == GV_ERR_ISBUSY) {
217		bioq_disksort(p->rqueue, bp);
218		return (-1); /* "Fail", and delay request. */
219	}
220	if (err) {
221		err = ENXIO;
222		goto bad;
223	}
224	err = ENXIO;
225
226	/* Find the right subdisk. */
227	i = 0;
228	LIST_FOREACH(s, &p->subdisks, in_plex) {
229		if (i == sdno)
230			break;
231		i++;
232	}
233
234	/* Subdisk not found. */
235	if (s == NULL || s->drive_sc == NULL)
236		goto bad;
237
238	/* Now check if we can handle the request on this subdisk. */
239	switch (s->state) {
240	case GV_SD_UP:
241		/* If the subdisk is up, just continue. */
242		break;
243	case GV_SD_DOWN:
244		if (bp->bio_pflags & GV_BIO_INTERNAL)
245			G_VINUM_DEBUG(0, "subdisk must be in the stale state in"
246			    " order to perform administrative requests");
247		goto bad;
248	case GV_SD_STALE:
249		if (!(bp->bio_pflags & GV_BIO_SYNCREQ)) {
250			G_VINUM_DEBUG(0, "subdisk stale, unable to perform "
251			    "regular requests");
252			goto bad;
253		}
254
255		G_VINUM_DEBUG(1, "sd %s is initializing", s->name);
256		gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE);
257		break;
258	case GV_SD_INITIALIZING:
259		if (bp->bio_cmd == BIO_READ)
260			goto bad;
261		break;
262	default:
263		/* All other subdisk states mean it's not accessible. */
264		goto bad;
265	}
266
267	/* Clone the bio and adjust the offsets and sizes. */
268	cbp = g_clone_bio(bp);
269	if (cbp == NULL) {
270		err = ENOMEM;
271		goto bad;
272	}
273	cbp->bio_offset = real_off + s->drive_offset;
274	cbp->bio_length = real_len;
275	cbp->bio_data = addr;
276	cbp->bio_done = gv_done;
277	cbp->bio_caller1 = s;
278	s->drive_sc->active++;
279
280	/* Store the sub-requests now and let others issue them. */
281	bioq_insert_tail(p->bqueue, cbp);
282	return (real_len);
283bad:
284	G_VINUM_LOGREQ(0, bp, "plex request failed.");
285	/* Building the sub-request failed. If internal BIO, do not deliver. */
286	if (bp->bio_pflags & GV_BIO_INTERNAL) {
287		if (bp->bio_pflags & GV_BIO_MALLOC)
288			g_free(bp->bio_data);
289		g_destroy_bio(bp);
290		p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING |
291		    GV_PLEX_GROWING);
292		return (-1);
293	}
294	g_io_deliver(bp, err);
295	return (-1);
296}
297
298/*
299 * Handle a completed request to a striped or concatenated plex.
300 */
301void
302gv_plex_normal_done(struct gv_plex *p, struct bio *bp)
303{
304	struct bio *pbp;
305
306	pbp = bp->bio_parent;
307	if (pbp->bio_error == 0)
308		pbp->bio_error = bp->bio_error;
309	g_destroy_bio(bp);
310	pbp->bio_inbed++;
311	if (pbp->bio_children == pbp->bio_inbed) {
312		/* Just set it to length since multiple plexes will
313		 * screw things up. */
314		pbp->bio_completed = pbp->bio_length;
315		if (pbp->bio_pflags & GV_BIO_SYNCREQ)
316			gv_sync_complete(p, pbp);
317		else if (pbp->bio_pflags & GV_BIO_GROW)
318			gv_grow_complete(p, pbp);
319		else
320			g_io_deliver(pbp, pbp->bio_error);
321	}
322}
323
324/*
325 * Handle a completed request to a RAID-5 plex.
326 */
327void
328gv_plex_raid5_done(struct gv_plex *p, struct bio *bp)
329{
330	struct gv_softc *sc;
331	struct bio *cbp, *pbp;
332	struct gv_bioq *bq, *bq2;
333	struct gv_raid5_packet *wp;
334	off_t completed;
335	int i;
336
337	completed = 0;
338	sc = p->vinumconf;
339	wp = bp->bio_caller2;
340
341	switch (bp->bio_parent->bio_cmd) {
342	case BIO_READ:
343		if (wp == NULL) {
344			completed = bp->bio_completed;
345			break;
346		}
347
348		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
349			if (bq->bp != bp)
350				continue;
351			TAILQ_REMOVE(&wp->bits, bq, queue);
352			g_free(bq);
353			for (i = 0; i < wp->length; i++)
354				wp->data[i] ^= bp->bio_data[i];
355			break;
356		}
357		if (TAILQ_EMPTY(&wp->bits)) {
358			completed = wp->length;
359			if (wp->lockbase != -1) {
360				TAILQ_REMOVE(&p->packets, wp, list);
361				/* Bring the waiting bios back into the game. */
362				pbp = bioq_takefirst(p->wqueue);
363				while (pbp != NULL) {
364					gv_post_bio(sc, pbp);
365					pbp = bioq_takefirst(p->wqueue);
366				}
367			}
368			g_free(wp);
369		}
370
371		break;
372
373 	case BIO_WRITE:
374		/* XXX can this ever happen? */
375		if (wp == NULL) {
376			completed = bp->bio_completed;
377			break;
378		}
379
380		/* Check if we need to handle parity data. */
381		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
382			if (bq->bp != bp)
383				continue;
384			TAILQ_REMOVE(&wp->bits, bq, queue);
385			g_free(bq);
386			cbp = wp->parity;
387			if (cbp != NULL) {
388				for (i = 0; i < wp->length; i++)
389					cbp->bio_data[i] ^= bp->bio_data[i];
390			}
391			break;
392		}
393
394		/* Handle parity data. */
395		if (TAILQ_EMPTY(&wp->bits)) {
396			if (bp->bio_parent->bio_pflags & GV_BIO_CHECK)
397				i = gv_check_parity(p, bp, wp);
398			else
399				i = gv_normal_parity(p, bp, wp);
400
401			/* All of our sub-requests have finished. */
402			if (i) {
403				completed = wp->length;
404				TAILQ_REMOVE(&p->packets, wp, list);
405				/* Bring the waiting bios back into the game. */
406				pbp = bioq_takefirst(p->wqueue);
407				while (pbp != NULL) {
408					gv_post_bio(sc, pbp);
409					pbp = bioq_takefirst(p->wqueue);
410				}
411				g_free(wp);
412			}
413		}
414
415		break;
416	}
417
418	pbp = bp->bio_parent;
419	if (pbp->bio_error == 0)
420		pbp->bio_error = bp->bio_error;
421	pbp->bio_completed += completed;
422
423	/* When the original request is finished, we deliver it. */
424	pbp->bio_inbed++;
425	if (pbp->bio_inbed == pbp->bio_children) {
426		/* Hand it over for checking or delivery. */
427		if (pbp->bio_cmd == BIO_WRITE &&
428		    (pbp->bio_pflags & GV_BIO_CHECK)) {
429			gv_parity_complete(p, pbp);
430		} else if (pbp->bio_cmd == BIO_WRITE &&
431		    (pbp->bio_pflags & GV_BIO_REBUILD)) {
432			gv_rebuild_complete(p, pbp);
433		} else if (pbp->bio_pflags & GV_BIO_INIT) {
434			gv_init_complete(p, pbp);
435		} else if (pbp->bio_pflags & GV_BIO_SYNCREQ) {
436			gv_sync_complete(p, pbp);
437		} else if (pbp->bio_pflags & GV_BIO_GROW) {
438			gv_grow_complete(p, pbp);
439		} else {
440			g_io_deliver(pbp, pbp->bio_error);
441		}
442	}
443
444	/* Clean up what we allocated. */
445	if (bp->bio_cflags & GV_BIO_MALLOC)
446		g_free(bp->bio_data);
447	g_destroy_bio(bp);
448}
449
450static int
451gv_check_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp)
452{
453	struct bio *pbp;
454	struct gv_sd *s;
455	int err, finished, i;
456
457	err = 0;
458	finished = 1;
459
460	if (wp->waiting != NULL) {
461		pbp = wp->waiting;
462		wp->waiting = NULL;
463		s = pbp->bio_caller1;
464		g_io_request(pbp, s->drive_sc->consumer);
465		finished = 0;
466
467	} else if (wp->parity != NULL) {
468		pbp = wp->parity;
469		wp->parity = NULL;
470
471		/* Check if the parity is correct. */
472		for (i = 0; i < wp->length; i++) {
473			if (bp->bio_data[i] != pbp->bio_data[i]) {
474				err = 1;
475				break;
476			}
477		}
478
479		/* The parity is not correct... */
480		if (err) {
481			bp->bio_parent->bio_error = EAGAIN;
482
483			/* ... but we rebuild it. */
484			if (bp->bio_parent->bio_pflags & GV_BIO_PARITY) {
485				s = pbp->bio_caller1;
486				g_io_request(pbp, s->drive_sc->consumer);
487				finished = 0;
488			}
489		}
490
491		/*
492		 * Clean up the BIO we would have used for rebuilding the
493		 * parity.
494		 */
495		if (finished) {
496			bp->bio_parent->bio_inbed++;
497			g_destroy_bio(pbp);
498		}
499	}
500
501	return (finished);
502}
503
504static int
505gv_normal_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp)
506{
507	struct bio *cbp, *pbp;
508	struct gv_sd *s;
509	int finished, i;
510
511	finished = 1;
512
513	if (wp->waiting != NULL) {
514		pbp = wp->waiting;
515		wp->waiting = NULL;
516		cbp = wp->parity;
517		for (i = 0; i < wp->length; i++)
518			cbp->bio_data[i] ^= pbp->bio_data[i];
519		s = pbp->bio_caller1;
520		g_io_request(pbp, s->drive_sc->consumer);
521		finished = 0;
522
523	} else if (wp->parity != NULL) {
524		cbp = wp->parity;
525		wp->parity = NULL;
526		s = cbp->bio_caller1;
527		g_io_request(cbp, s->drive_sc->consumer);
528		finished = 0;
529	}
530
531	return (finished);
532}
533
534/* Flush the queue with delayed requests. */
535static void
536gv_plex_flush(struct gv_plex *p)
537{
538	struct bio *bp;
539
540	bp = bioq_takefirst(p->rqueue);
541	while (bp != NULL) {
542		gv_plex_start(p, bp);
543		bp = bioq_takefirst(p->rqueue);
544	}
545}
546
547static void
548gv_post_bio(struct gv_softc *sc, struct bio *bp)
549{
550
551	KASSERT(sc != NULL, ("NULL sc"));
552	KASSERT(bp != NULL, ("NULL bp"));
553	mtx_lock(&sc->bqueue_mtx);
554	bioq_disksort(sc->bqueue_down, bp);
555	wakeup(sc);
556	mtx_unlock(&sc->bqueue_mtx);
557}
558
559int
560gv_sync_request(struct gv_plex *from, struct gv_plex *to, off_t offset,
561    off_t length, int type, caddr_t data)
562{
563	struct gv_softc *sc;
564	struct bio *bp;
565
566	KASSERT(from != NULL, ("NULL from"));
567	KASSERT(to != NULL, ("NULL to"));
568	sc = from->vinumconf;
569	KASSERT(sc != NULL, ("NULL sc"));
570
571	bp = g_new_bio();
572	if (bp == NULL) {
573		G_VINUM_DEBUG(0, "sync from '%s' failed at offset "
574		    " %jd; out of memory", from->name, offset);
575		return (ENOMEM);
576	}
577	bp->bio_length = length;
578	bp->bio_done = NULL;
579	bp->bio_pflags |= GV_BIO_SYNCREQ;
580	bp->bio_offset = offset;
581	bp->bio_caller1 = from;
582	bp->bio_caller2 = to;
583	bp->bio_cmd = type;
584	if (data == NULL)
585		data = g_malloc(length, M_WAITOK);
586	bp->bio_pflags |= GV_BIO_MALLOC; /* Free on the next run. */
587	bp->bio_data = data;
588
589	/* Send down next. */
590	gv_post_bio(sc, bp);
591	//gv_plex_start(from, bp);
592	return (0);
593}
594
595/*
596 * Handle a finished plex sync bio.
597 */
598int
599gv_sync_complete(struct gv_plex *to, struct bio *bp)
600{
601	struct gv_plex *from, *p;
602	struct gv_sd *s;
603	struct gv_volume *v;
604	struct gv_softc *sc;
605	off_t offset;
606	int err;
607
608	g_topology_assert_not();
609
610	err = 0;
611	KASSERT(to != NULL, ("NULL to"));
612	KASSERT(bp != NULL, ("NULL bp"));
613	from = bp->bio_caller2;
614	KASSERT(from != NULL, ("NULL from"));
615	v = to->vol_sc;
616	KASSERT(v != NULL, ("NULL v"));
617	sc = v->vinumconf;
618	KASSERT(sc != NULL, ("NULL sc"));
619
620	/* If it was a read, write it. */
621	if (bp->bio_cmd == BIO_READ) {
622		err = gv_sync_request(from, to, bp->bio_offset, bp->bio_length,
623	    	    BIO_WRITE, bp->bio_data);
624	/* If it was a write, read the next one. */
625	} else if (bp->bio_cmd == BIO_WRITE) {
626		if (bp->bio_pflags & GV_BIO_MALLOC)
627			g_free(bp->bio_data);
628		to->synced += bp->bio_length;
629		/* If we're finished, clean up. */
630		if (bp->bio_offset + bp->bio_length >= from->size) {
631			G_VINUM_DEBUG(1, "syncing of %s from %s completed",
632			    to->name, from->name);
633			/* Update our state. */
634			LIST_FOREACH(s, &to->subdisks, in_plex)
635				gv_set_sd_state(s, GV_SD_UP, 0);
636			gv_update_plex_state(to);
637			to->flags &= ~GV_PLEX_SYNCING;
638			to->synced = 0;
639			gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
640		} else {
641			offset = bp->bio_offset + bp->bio_length;
642			err = gv_sync_request(from, to, offset,
643			    MIN(bp->bio_length, from->size - offset),
644			    BIO_READ, NULL);
645		}
646	}
647	g_destroy_bio(bp);
648	/* Clean up if there was an error. */
649	if (err) {
650		to->flags &= ~GV_PLEX_SYNCING;
651		G_VINUM_DEBUG(0, "error syncing plexes: error code %d", err);
652	}
653
654	/* Check if all plexes are synced, and lower refcounts. */
655	g_topology_lock();
656	LIST_FOREACH(p, &v->plexes, in_volume) {
657		if (p->flags & GV_PLEX_SYNCING) {
658			g_topology_unlock();
659			return (-1);
660		}
661	}
662	/* If we came here, all plexes are synced, and we're free. */
663	gv_access(v->provider, -1, -1, 0);
664	g_topology_unlock();
665	G_VINUM_DEBUG(1, "plex sync completed");
666	gv_volume_flush(v);
667	return (0);
668}
669
670/*
671 * Create a new bio struct for the next grow request.
672 */
673int
674gv_grow_request(struct gv_plex *p, off_t offset, off_t length, int type,
675    caddr_t data)
676{
677	struct gv_softc *sc;
678	struct bio *bp;
679
680	KASSERT(p != NULL, ("gv_grow_request: NULL p"));
681	sc = p->vinumconf;
682	KASSERT(sc != NULL, ("gv_grow_request: NULL sc"));
683
684	bp = g_new_bio();
685	if (bp == NULL) {
686		G_VINUM_DEBUG(0, "grow of %s failed creating bio: "
687		    "out of memory", p->name);
688		return (ENOMEM);
689	}
690
691	bp->bio_cmd = type;
692	bp->bio_done = NULL;
693	bp->bio_error = 0;
694	bp->bio_caller1 = p;
695	bp->bio_offset = offset;
696	bp->bio_length = length;
697	bp->bio_pflags |= GV_BIO_GROW;
698	if (data == NULL)
699		data = g_malloc(length, M_WAITOK);
700	bp->bio_pflags |= GV_BIO_MALLOC;
701	bp->bio_data = data;
702
703	gv_post_bio(sc, bp);
704	//gv_plex_start(p, bp);
705	return (0);
706}
707
708/*
709 * Finish handling of a bio to a growing plex.
710 */
711void
712gv_grow_complete(struct gv_plex *p, struct bio *bp)
713{
714	struct gv_softc *sc;
715	struct gv_sd *s;
716	struct gv_volume *v;
717	off_t origsize, offset;
718	int sdcount, err;
719
720	v = p->vol_sc;
721	KASSERT(v != NULL, ("gv_grow_complete: NULL v"));
722	sc = v->vinumconf;
723	KASSERT(sc != NULL, ("gv_grow_complete: NULL sc"));
724	err = 0;
725
726	/* If it was a read, write it. */
727	if (bp->bio_cmd == BIO_READ) {
728		p->synced += bp->bio_length;
729		err = gv_grow_request(p, bp->bio_offset, bp->bio_length,
730		    BIO_WRITE, bp->bio_data);
731	/* If it was a write, read next. */
732	} else if (bp->bio_cmd == BIO_WRITE) {
733		if (bp->bio_pflags & GV_BIO_MALLOC)
734			g_free(bp->bio_data);
735
736		/* Find the real size of the plex. */
737		sdcount = gv_sdcount(p, 1);
738		s = LIST_FIRST(&p->subdisks);
739		KASSERT(s != NULL, ("NULL s"));
740		origsize = (s->size * (sdcount - 1));
741		if (bp->bio_offset + bp->bio_length >= origsize) {
742			G_VINUM_DEBUG(1, "growing of %s completed", p->name);
743			p->flags &= ~GV_PLEX_GROWING;
744			LIST_FOREACH(s, &p->subdisks, in_plex) {
745				s->flags &= ~GV_SD_GROW;
746				gv_set_sd_state(s, GV_SD_UP, 0);
747			}
748			p->size = gv_plex_size(p);
749			gv_update_vol_size(v, gv_vol_size(v));
750			gv_set_plex_state(p, GV_PLEX_UP, 0);
751			g_topology_lock();
752			gv_access(v->provider, -1, -1, 0);
753			g_topology_unlock();
754			p->synced = 0;
755			gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
756			/* Issue delayed requests. */
757			gv_plex_flush(p);
758		} else {
759			offset = bp->bio_offset + bp->bio_length;
760			err = gv_grow_request(p, offset,
761			   MIN(bp->bio_length, origsize - offset),
762			   BIO_READ, NULL);
763		}
764	}
765	g_destroy_bio(bp);
766
767	if (err) {
768		p->flags &= ~GV_PLEX_GROWING;
769		G_VINUM_DEBUG(0, "error growing plex: error code %d", err);
770	}
771}
772
773/*
774 * Create an initialization BIO and send it off to the consumer. Assume that
775 * we're given initialization data as parameter.
776 */
777void
778gv_init_request(struct gv_sd *s, off_t start, caddr_t data, off_t length)
779{
780	struct gv_drive *d;
781	struct g_consumer *cp;
782	struct bio *bp, *cbp;
783
784	KASSERT(s != NULL, ("gv_init_request: NULL s"));
785	d = s->drive_sc;
786	KASSERT(d != NULL, ("gv_init_request: NULL d"));
787	cp = d->consumer;
788	KASSERT(cp != NULL, ("gv_init_request: NULL cp"));
789
790	bp = g_new_bio();
791	if (bp == NULL) {
792		G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd"
793		    " (drive offset %jd); out of memory", s->name,
794		    (intmax_t)s->initialized, (intmax_t)start);
795		return; /* XXX: Error codes. */
796	}
797	bp->bio_cmd = BIO_WRITE;
798	bp->bio_data = data;
799	bp->bio_done = NULL;
800	bp->bio_error = 0;
801	bp->bio_length = length;
802	bp->bio_pflags |= GV_BIO_INIT;
803	bp->bio_offset = start;
804	bp->bio_caller1 = s;
805
806	/* Then ofcourse, we have to clone it. */
807	cbp = g_clone_bio(bp);
808	if (cbp == NULL) {
809		G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd"
810		    " (drive offset %jd); out of memory", s->name,
811		    (intmax_t)s->initialized, (intmax_t)start);
812		return; /* XXX: Error codes. */
813	}
814	cbp->bio_done = gv_done;
815	cbp->bio_caller1 = s;
816	d->active++;
817	/* Send it off to the consumer. */
818	g_io_request(cbp, cp);
819}
820
821/*
822 * Handle a finished initialization BIO.
823 */
824void
825gv_init_complete(struct gv_plex *p, struct bio *bp)
826{
827	struct gv_softc *sc;
828	struct gv_drive *d;
829	struct g_consumer *cp;
830	struct gv_sd *s;
831	off_t start, length;
832	caddr_t data;
833	int error;
834
835	s = bp->bio_caller1;
836	start = bp->bio_offset;
837	length = bp->bio_length;
838	error = bp->bio_error;
839	data = bp->bio_data;
840
841	KASSERT(s != NULL, ("gv_init_complete: NULL s"));
842	d = s->drive_sc;
843	KASSERT(d != NULL, ("gv_init_complete: NULL d"));
844	cp = d->consumer;
845	KASSERT(cp != NULL, ("gv_init_complete: NULL cp"));
846	sc = p->vinumconf;
847	KASSERT(sc != NULL, ("gv_init_complete: NULL sc"));
848
849	g_destroy_bio(bp);
850
851	/*
852	 * First we need to find out if it was okay, and abort if it's not.
853	 * Then we need to free previous buffers, find out the correct subdisk,
854	 * as well as getting the correct starting point and length of the BIO.
855	 */
856	if (start >= s->drive_offset + s->size) {
857		/* Free the data we initialized. */
858		g_free(data);
859		g_topology_assert_not();
860		g_topology_lock();
861		g_access(cp, 0, -1, 0);
862		g_topology_unlock();
863		if (error) {
864			gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE |
865			    GV_SETSTATE_CONFIG);
866		} else {
867			gv_set_sd_state(s, GV_SD_UP, GV_SETSTATE_CONFIG);
868			s->initialized = 0;
869			gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
870			G_VINUM_DEBUG(1, "subdisk '%s' init: finished "
871			    "successfully", s->name);
872		}
873		return;
874	}
875	s->initialized += length;
876	start += length;
877	gv_init_request(s, start, data, length);
878}
879
880/*
881 * Create a new bio struct for the next parity rebuild. Used both by internal
882 * rebuild of degraded plexes as well as user initiated rebuilds/checks.
883 */
884void
885gv_parity_request(struct gv_plex *p, int flags, off_t offset)
886{
887	struct gv_softc *sc;
888	struct bio *bp;
889
890	KASSERT(p != NULL, ("gv_parity_request: NULL p"));
891	sc = p->vinumconf;
892	KASSERT(sc != NULL, ("gv_parity_request: NULL sc"));
893
894	bp = g_new_bio();
895	if (bp == NULL) {
896		G_VINUM_DEBUG(0, "rebuild of %s failed creating bio: "
897		    "out of memory", p->name);
898		return;
899	}
900
901	bp->bio_cmd = BIO_WRITE;
902	bp->bio_done = NULL;
903	bp->bio_error = 0;
904	bp->bio_length = p->stripesize;
905	bp->bio_caller1 = p;
906
907	/*
908	 * Check if it's a rebuild of a degraded plex or a user request of
909	 * parity rebuild.
910	 */
911	if (flags & GV_BIO_REBUILD)
912		bp->bio_data = g_malloc(GV_DFLT_SYNCSIZE, M_WAITOK);
913	else if (flags & GV_BIO_CHECK)
914		bp->bio_data = g_malloc(p->stripesize, M_WAITOK | M_ZERO);
915	else {
916		G_VINUM_DEBUG(0, "invalid flags given in rebuild");
917		return;
918	}
919
920	bp->bio_pflags = flags;
921	bp->bio_pflags |= GV_BIO_MALLOC;
922
923	/* We still have more parity to build. */
924	bp->bio_offset = offset;
925	gv_post_bio(sc, bp);
926	//gv_plex_start(p, bp); /* Send it down to the plex. */
927}
928
929/*
930 * Handle a finished parity write.
931 */
932void
933gv_parity_complete(struct gv_plex *p, struct bio *bp)
934{
935	struct gv_softc *sc;
936	int error, flags;
937
938	error = bp->bio_error;
939	flags = bp->bio_pflags;
940	flags &= ~GV_BIO_MALLOC;
941
942	sc = p->vinumconf;
943	KASSERT(sc != NULL, ("gv_parity_complete: NULL sc"));
944
945	/* Clean up what we allocated. */
946	if (bp->bio_pflags & GV_BIO_MALLOC)
947		g_free(bp->bio_data);
948	g_destroy_bio(bp);
949
950	if (error == EAGAIN) {
951		G_VINUM_DEBUG(0, "parity incorrect at offset 0x%jx",
952		    (intmax_t)p->synced);
953	}
954
955	/* Any error is fatal, except EAGAIN when we're rebuilding. */
956	if (error && !(error == EAGAIN && (flags & GV_BIO_PARITY))) {
957		/* Make sure we don't have the lock. */
958		g_topology_assert_not();
959		g_topology_lock();
960		gv_access(p->vol_sc->provider, -1, -1, 0);
961		g_topology_unlock();
962		G_VINUM_DEBUG(0, "parity check on %s failed at 0x%jx "
963		    "errno %d", p->name, (intmax_t)p->synced, error);
964		return;
965	} else {
966		p->synced += p->stripesize;
967	}
968
969	if (p->synced >= p->size) {
970		/* Make sure we don't have the lock. */
971		g_topology_assert_not();
972		g_topology_lock();
973		gv_access(p->vol_sc->provider, -1, -1, 0);
974		g_topology_unlock();
975		/* We're finished. */
976		G_VINUM_DEBUG(1, "parity operation on %s finished", p->name);
977		p->synced = 0;
978		gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
979		return;
980	}
981
982	/* Send down next. It will determine if we need to itself. */
983	gv_parity_request(p, flags, p->synced);
984}
985
986/*
987 * Handle a finished plex rebuild bio.
988 */
989void
990gv_rebuild_complete(struct gv_plex *p, struct bio *bp)
991{
992	struct gv_softc *sc;
993	struct gv_sd *s;
994	int error, flags;
995	off_t offset;
996
997	error = bp->bio_error;
998	flags = bp->bio_pflags;
999	offset = bp->bio_offset;
1000	flags &= ~GV_BIO_MALLOC;
1001	sc = p->vinumconf;
1002	KASSERT(sc != NULL, ("gv_rebuild_complete: NULL sc"));
1003
1004	/* Clean up what we allocated. */
1005	if (bp->bio_pflags & GV_BIO_MALLOC)
1006		g_free(bp->bio_data);
1007	g_destroy_bio(bp);
1008
1009	if (error) {
1010		g_topology_assert_not();
1011		g_topology_lock();
1012		gv_access(p->vol_sc->provider, -1, -1, 0);
1013		g_topology_unlock();
1014
1015		G_VINUM_DEBUG(0, "rebuild of %s failed at offset %jd errno: %d",
1016		    p->name, (intmax_t)offset, error);
1017		p->flags &= ~GV_PLEX_REBUILDING;
1018		p->synced = 0;
1019		gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */
1020		return;
1021	}
1022
1023	offset += (p->stripesize * (gv_sdcount(p, 1) - 1));
1024	if (offset >= p->size) {
1025		/* We're finished. */
1026		g_topology_assert_not();
1027		g_topology_lock();
1028		gv_access(p->vol_sc->provider, -1, -1, 0);
1029		g_topology_unlock();
1030
1031		G_VINUM_DEBUG(1, "rebuild of %s finished", p->name);
1032		gv_save_config(p->vinumconf);
1033		p->flags &= ~GV_PLEX_REBUILDING;
1034		p->synced = 0;
1035		/* Try to up all subdisks. */
1036		LIST_FOREACH(s, &p->subdisks, in_plex)
1037			gv_update_sd_state(s);
1038		gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
1039		gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */
1040		return;
1041	}
1042
1043	/* Send down next. It will determine if we need to itself. */
1044	gv_parity_request(p, flags, offset);
1045}
1046