1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2004, 2007 Lukas Ertl
5 * Copyright (c) 2007, 2009 Ulf Lilleengen
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD$");
32
33#include <sys/param.h>
34#include <sys/bio.h>
35#include <sys/lock.h>
36#include <sys/malloc.h>
37#include <sys/systm.h>
38
39#include <geom/geom.h>
40#include <geom/geom_dbg.h>
41#include <geom/vinum/geom_vinum_var.h>
42#include <geom/vinum/geom_vinum_raid5.h>
43#include <geom/vinum/geom_vinum.h>
44
45static int	gv_check_parity(struct gv_plex *, struct bio *,
46		    struct gv_raid5_packet *);
47static int	gv_normal_parity(struct gv_plex *, struct bio *,
48		    struct gv_raid5_packet *);
49static void	gv_plex_flush(struct gv_plex *);
50static int	gv_plex_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *,
51		    int *, int);
52static int 	gv_plex_normal_request(struct gv_plex *, struct bio *, off_t,
53		    off_t,  caddr_t);
54static void	gv_post_bio(struct gv_softc *, struct bio *);
55
56void
57gv_plex_start(struct gv_plex *p, struct bio *bp)
58{
59	struct bio *cbp;
60	struct gv_sd *s;
61	struct gv_raid5_packet *wp;
62	caddr_t addr;
63	off_t bcount, boff, len;
64
65	bcount = bp->bio_length;
66	addr = bp->bio_data;
67	boff = bp->bio_offset;
68
69	/* Walk over the whole length of the request, we might split it up. */
70	while (bcount > 0) {
71		wp = NULL;
72
73 		/*
74		 * RAID5 plexes need special treatment, as a single request
75		 * might involve several read/write sub-requests.
76 		 */
77		if (p->org == GV_PLEX_RAID5) {
78			wp = gv_raid5_start(p, bp, addr, boff, bcount);
79 			if (wp == NULL)
80 				return;
81
82			len = wp->length;
83
84			if (TAILQ_EMPTY(&wp->bits))
85				g_free(wp);
86			else if (wp->lockbase != -1)
87				TAILQ_INSERT_TAIL(&p->packets, wp, list);
88
89		/*
90		 * Requests to concatenated and striped plexes go straight
91		 * through.
92		 */
93		} else {
94			len = gv_plex_normal_request(p, bp, boff, bcount, addr);
95		}
96		if (len < 0)
97			return;
98
99		bcount -= len;
100		addr += len;
101		boff += len;
102	}
103
104	/*
105	 * Fire off all sub-requests.  We get the correct consumer (== drive)
106	 * to send each request to via the subdisk that was stored in
107	 * cbp->bio_caller1.
108	 */
109	cbp = bioq_takefirst(p->bqueue);
110	while (cbp != NULL) {
111		/*
112		 * RAID5 sub-requests need to come in correct order, otherwise
113		 * we trip over the parity, as it might be overwritten by
114		 * another sub-request.  We abuse cbp->bio_caller2 to mark
115		 * potential overlap situations.
116		 */
117		if (cbp->bio_caller2 != NULL && gv_stripe_active(p, cbp)) {
118			/* Park the bio on the waiting queue. */
119			cbp->bio_pflags |= GV_BIO_ONHOLD;
120			bioq_disksort(p->wqueue, cbp);
121		} else {
122			s = cbp->bio_caller1;
123			g_io_request(cbp, s->drive_sc->consumer);
124		}
125		cbp = bioq_takefirst(p->bqueue);
126	}
127}
128
129static int
130gv_plex_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
131    off_t *real_len, int *sdno, int growing)
132{
133	struct gv_sd *s;
134	int i, sdcount;
135	off_t len_left, stripeend, stripeno, stripestart;
136
137	switch (p->org) {
138	case GV_PLEX_CONCAT:
139		/*
140		 * Find the subdisk where this request starts.  The subdisks in
141		 * this list must be ordered by plex_offset.
142		 */
143		i = 0;
144		LIST_FOREACH(s, &p->subdisks, in_plex) {
145			if (s->plex_offset <= boff &&
146			    s->plex_offset + s->size > boff) {
147				*sdno = i;
148				break;
149			}
150			i++;
151		}
152		if (s == NULL || s->drive_sc == NULL)
153			return (GV_ERR_NOTFOUND);
154
155		/* Calculate corresponding offsets on disk. */
156		*real_off = boff - s->plex_offset;
157		len_left = s->size - (*real_off);
158		KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0"));
159		*real_len = (bcount > len_left) ? len_left : bcount;
160		break;
161
162	case GV_PLEX_STRIPED:
163		/* The number of the stripe where the request starts. */
164		stripeno = boff / p->stripesize;
165		KASSERT(stripeno >= 0, ("gv_plex_offset: stripeno < 0"));
166
167		/* Take growing subdisks into account when calculating. */
168		sdcount = gv_sdcount(p, (boff >= p->synced));
169
170		if (!(boff + bcount <= p->synced) &&
171		    (p->flags & GV_PLEX_GROWING) &&
172		    !growing)
173			return (GV_ERR_ISBUSY);
174		*sdno = stripeno % sdcount;
175
176		KASSERT(sdno >= 0, ("gv_plex_offset: sdno < 0"));
177		stripestart = (stripeno / sdcount) *
178		    p->stripesize;
179		KASSERT(stripestart >= 0, ("gv_plex_offset: stripestart < 0"));
180		stripeend = stripestart + p->stripesize;
181		*real_off = boff - (stripeno * p->stripesize) +
182		    stripestart;
183		len_left = stripeend - *real_off;
184		KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0"));
185
186		*real_len = (bcount <= len_left) ? bcount : len_left;
187		break;
188
189	default:
190		return (GV_ERR_PLEXORG);
191	}
192	return (0);
193}
194
195/*
196 * Prepare a normal plex request.
197 */
198static int
199gv_plex_normal_request(struct gv_plex *p, struct bio *bp, off_t boff,
200    off_t bcount,  caddr_t addr)
201{
202	struct gv_sd *s;
203	struct bio *cbp;
204	off_t real_len, real_off;
205	int i, err, sdno;
206
207	s = NULL;
208	sdno = -1;
209	real_len = real_off = 0;
210
211	err = ENXIO;
212
213	if (p == NULL || LIST_EMPTY(&p->subdisks))
214		goto bad;
215
216	err = gv_plex_offset(p, boff, bcount, &real_off,
217	    &real_len, &sdno, (bp->bio_pflags & GV_BIO_GROW));
218	/* If the request was blocked, put it into wait. */
219	if (err == GV_ERR_ISBUSY) {
220		bioq_disksort(p->rqueue, bp);
221		return (-1); /* "Fail", and delay request. */
222	}
223	if (err) {
224		err = ENXIO;
225		goto bad;
226	}
227	err = ENXIO;
228
229	/* Find the right subdisk. */
230	i = 0;
231	LIST_FOREACH(s, &p->subdisks, in_plex) {
232		if (i == sdno)
233			break;
234		i++;
235	}
236
237	/* Subdisk not found. */
238	if (s == NULL || s->drive_sc == NULL)
239		goto bad;
240
241	/* Now check if we can handle the request on this subdisk. */
242	switch (s->state) {
243	case GV_SD_UP:
244		/* If the subdisk is up, just continue. */
245		break;
246	case GV_SD_DOWN:
247		if (bp->bio_pflags & GV_BIO_INTERNAL)
248			G_VINUM_DEBUG(0, "subdisk must be in the stale state in"
249			    " order to perform administrative requests");
250		goto bad;
251	case GV_SD_STALE:
252		if (!(bp->bio_pflags & GV_BIO_SYNCREQ)) {
253			G_VINUM_DEBUG(0, "subdisk stale, unable to perform "
254			    "regular requests");
255			goto bad;
256		}
257
258		G_VINUM_DEBUG(1, "sd %s is initializing", s->name);
259		gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE);
260		break;
261	case GV_SD_INITIALIZING:
262		if (bp->bio_cmd == BIO_READ)
263			goto bad;
264		break;
265	default:
266		/* All other subdisk states mean it's not accessible. */
267		goto bad;
268	}
269
270	/* Clone the bio and adjust the offsets and sizes. */
271	cbp = g_clone_bio(bp);
272	if (cbp == NULL) {
273		err = ENOMEM;
274		goto bad;
275	}
276	cbp->bio_offset = real_off + s->drive_offset;
277	cbp->bio_length = real_len;
278	cbp->bio_data = addr;
279	cbp->bio_done = gv_done;
280	cbp->bio_caller1 = s;
281	s->drive_sc->active++;
282
283	/* Store the sub-requests now and let others issue them. */
284	bioq_insert_tail(p->bqueue, cbp);
285	return (real_len);
286bad:
287	G_VINUM_LOGREQ(0, bp, "plex request failed.");
288	/* Building the sub-request failed. If internal BIO, do not deliver. */
289	if (bp->bio_pflags & GV_BIO_INTERNAL) {
290		if (bp->bio_pflags & GV_BIO_MALLOC)
291			g_free(bp->bio_data);
292		g_destroy_bio(bp);
293		p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING |
294		    GV_PLEX_GROWING);
295		return (-1);
296	}
297	g_io_deliver(bp, err);
298	return (-1);
299}
300
301/*
302 * Handle a completed request to a striped or concatenated plex.
303 */
304void
305gv_plex_normal_done(struct gv_plex *p, struct bio *bp)
306{
307	struct bio *pbp;
308
309	pbp = bp->bio_parent;
310	if (pbp->bio_error == 0)
311		pbp->bio_error = bp->bio_error;
312	g_destroy_bio(bp);
313	pbp->bio_inbed++;
314	if (pbp->bio_children == pbp->bio_inbed) {
315		/* Just set it to length since multiple plexes will
316		 * screw things up. */
317		pbp->bio_completed = pbp->bio_length;
318		if (pbp->bio_pflags & GV_BIO_SYNCREQ)
319			gv_sync_complete(p, pbp);
320		else if (pbp->bio_pflags & GV_BIO_GROW)
321			gv_grow_complete(p, pbp);
322		else
323			g_io_deliver(pbp, pbp->bio_error);
324	}
325}
326
/*
 * Handle a completed request to a RAID-5 plex.
 *
 * bp is the finished sub-request.  bp->bio_caller2, if set, is the RAID-5
 * packet (wp) the sub-request belongs to; wp->bits lists the sub-requests
 * the packet is still waiting for.  The amount of data the sub-request
 * contributes is added to the parent's bio_completed, and once all
 * children of the parent are accounted for the parent is handed to the
 * matching completion routine or delivered.  Finally the sub-request
 * itself is released.
 */
void
gv_plex_raid5_done(struct gv_plex *p, struct bio *bp)
{
	struct gv_softc *sc;
	struct bio *cbp, *pbp;
	struct gv_bioq *bq, *bq2;
	struct gv_raid5_packet *wp;
	off_t completed;
	int i;

	completed = 0;
	sc = p->vinumconf;
	wp = bp->bio_caller2;

	switch (bp->bio_parent->bio_cmd) {
	case BIO_READ:
		/* No packet attached: count what the sub-request reports. */
		if (wp == NULL) {
			completed = bp->bio_completed;
			break;
		}

		/*
		 * Take this sub-request off the packet's list and XOR its
		 * data into the packet buffer.
		 */
		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
			if (bq->bp != bp)
				continue;
			TAILQ_REMOVE(&wp->bits, bq, queue);
			g_free(bq);
			for (i = 0; i < wp->length; i++)
				wp->data[i] ^= bp->bio_data[i];
			break;
		}
		/* Nothing left to wait for: the packet is finished. */
		if (TAILQ_EMPTY(&wp->bits)) {
			completed = wp->length;
			/*
			 * gv_plex_start() only puts packets with a lockbase
			 * other than -1 on p->packets, so only those are
			 * removed from it here.
			 */
			if (wp->lockbase != -1) {
				TAILQ_REMOVE(&p->packets, wp, list);
				/* Bring the waiting bios back into the game. */
				pbp = bioq_takefirst(p->wqueue);
				while (pbp != NULL) {
					gv_post_bio(sc, pbp);
					pbp = bioq_takefirst(p->wqueue);
				}
			}
			g_free(wp);
		}

		break;

	case BIO_WRITE:
		/* XXX can this ever happen? */
		if (wp == NULL) {
			completed = bp->bio_completed;
			break;
		}

		/* Check if we need to handle parity data. */
		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
			if (bq->bp != bp)
				continue;
			TAILQ_REMOVE(&wp->bits, bq, queue);
			g_free(bq);
			/* XOR the data into the parity bio, if there is one. */
			cbp = wp->parity;
			if (cbp != NULL) {
				for (i = 0; i < wp->length; i++)
					cbp->bio_data[i] ^= bp->bio_data[i];
			}
			break;
		}

		/* Handle parity data. */
		if (TAILQ_EMPTY(&wp->bits)) {
			/*
			 * The helpers return non-zero when they issued no
			 * further request for this packet.
			 */
			if (bp->bio_parent->bio_pflags & GV_BIO_CHECK)
				i = gv_check_parity(p, bp, wp);
			else
				i = gv_normal_parity(p, bp, wp);

			/* All of our sub-requests have finished. */
			if (i) {
				completed = wp->length;
				TAILQ_REMOVE(&p->packets, wp, list);
				/* Bring the waiting bios back into the game. */
				pbp = bioq_takefirst(p->wqueue);
				while (pbp != NULL) {
					gv_post_bio(sc, pbp);
					pbp = bioq_takefirst(p->wqueue);
				}
				g_free(wp);
			}
		}

		break;
	}

	/* Propagate the first error and the progress to the parent. */
	pbp = bp->bio_parent;
	if (pbp->bio_error == 0)
		pbp->bio_error = bp->bio_error;
	pbp->bio_completed += completed;

	/* When the original request is finished, we deliver it. */
	pbp->bio_inbed++;
	if (pbp->bio_inbed == pbp->bio_children) {
		/* Hand it over for checking or delivery. */
		if (pbp->bio_cmd == BIO_WRITE &&
		    (pbp->bio_pflags & GV_BIO_CHECK)) {
			gv_parity_complete(p, pbp);
		} else if (pbp->bio_cmd == BIO_WRITE &&
		    (pbp->bio_pflags & GV_BIO_REBUILD)) {
			gv_rebuild_complete(p, pbp);
		} else if (pbp->bio_pflags & GV_BIO_INIT) {
			gv_init_complete(p, pbp);
		} else if (pbp->bio_pflags & GV_BIO_SYNCREQ) {
			gv_sync_complete(p, pbp);
		} else if (pbp->bio_pflags & GV_BIO_GROW) {
			gv_grow_complete(p, pbp);
		} else {
			g_io_deliver(pbp, pbp->bio_error);
		}
	}

	/*
	 * Clean up what we allocated.  Note that this tests bio_cflags of
	 * the sub-request, not bio_pflags.
	 */
	if (bp->bio_cflags & GV_BIO_MALLOC)
		g_free(bp->bio_data);
	g_destroy_bio(bp);
}
452
453static int
454gv_check_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp)
455{
456	struct bio *pbp;
457	struct gv_sd *s;
458	int err, finished, i;
459
460	err = 0;
461	finished = 1;
462
463	if (wp->waiting != NULL) {
464		pbp = wp->waiting;
465		wp->waiting = NULL;
466		s = pbp->bio_caller1;
467		g_io_request(pbp, s->drive_sc->consumer);
468		finished = 0;
469
470	} else if (wp->parity != NULL) {
471		pbp = wp->parity;
472		wp->parity = NULL;
473
474		/* Check if the parity is correct. */
475		for (i = 0; i < wp->length; i++) {
476			if (bp->bio_data[i] != pbp->bio_data[i]) {
477				err = 1;
478				break;
479			}
480		}
481
482		/* The parity is not correct... */
483		if (err) {
484			bp->bio_parent->bio_error = EAGAIN;
485
486			/* ... but we rebuild it. */
487			if (bp->bio_parent->bio_pflags & GV_BIO_PARITY) {
488				s = pbp->bio_caller1;
489				g_io_request(pbp, s->drive_sc->consumer);
490				finished = 0;
491			}
492		}
493
494		/*
495		 * Clean up the BIO we would have used for rebuilding the
496		 * parity.
497		 */
498		if (finished) {
499			bp->bio_parent->bio_inbed++;
500			g_destroy_bio(pbp);
501		}
502	}
503
504	return (finished);
505}
506
507static int
508gv_normal_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp)
509{
510	struct bio *cbp, *pbp;
511	struct gv_sd *s;
512	int finished, i;
513
514	finished = 1;
515
516	if (wp->waiting != NULL) {
517		pbp = wp->waiting;
518		wp->waiting = NULL;
519		cbp = wp->parity;
520		for (i = 0; i < wp->length; i++)
521			cbp->bio_data[i] ^= pbp->bio_data[i];
522		s = pbp->bio_caller1;
523		g_io_request(pbp, s->drive_sc->consumer);
524		finished = 0;
525
526	} else if (wp->parity != NULL) {
527		cbp = wp->parity;
528		wp->parity = NULL;
529		s = cbp->bio_caller1;
530		g_io_request(cbp, s->drive_sc->consumer);
531		finished = 0;
532	}
533
534	return (finished);
535}
536
537/* Flush the queue with delayed requests. */
538static void
539gv_plex_flush(struct gv_plex *p)
540{
541	struct gv_softc *sc;
542	struct bio *bp;
543
544	sc = p->vinumconf;
545	bp = bioq_takefirst(p->rqueue);
546	while (bp != NULL) {
547		gv_plex_start(p, bp);
548		bp = bioq_takefirst(p->rqueue);
549	}
550}
551
552static void
553gv_post_bio(struct gv_softc *sc, struct bio *bp)
554{
555
556	KASSERT(sc != NULL, ("NULL sc"));
557	KASSERT(bp != NULL, ("NULL bp"));
558	mtx_lock(&sc->bqueue_mtx);
559	bioq_disksort(sc->bqueue_down, bp);
560	wakeup(sc);
561	mtx_unlock(&sc->bqueue_mtx);
562}
563
564int
565gv_sync_request(struct gv_plex *from, struct gv_plex *to, off_t offset,
566    off_t length, int type, caddr_t data)
567{
568	struct gv_softc *sc;
569	struct bio *bp;
570
571	KASSERT(from != NULL, ("NULL from"));
572	KASSERT(to != NULL, ("NULL to"));
573	sc = from->vinumconf;
574	KASSERT(sc != NULL, ("NULL sc"));
575
576	bp = g_new_bio();
577	if (bp == NULL) {
578		G_VINUM_DEBUG(0, "sync from '%s' failed at offset "
579		    " %jd; out of memory", from->name, offset);
580		return (ENOMEM);
581	}
582	bp->bio_length = length;
583	bp->bio_done = NULL;
584	bp->bio_pflags |= GV_BIO_SYNCREQ;
585	bp->bio_offset = offset;
586	bp->bio_caller1 = from;
587	bp->bio_caller2 = to;
588	bp->bio_cmd = type;
589	if (data == NULL)
590		data = g_malloc(length, M_WAITOK);
591	bp->bio_pflags |= GV_BIO_MALLOC; /* Free on the next run. */
592	bp->bio_data = data;
593
594	/* Send down next. */
595	gv_post_bio(sc, bp);
596	//gv_plex_start(from, bp);
597	return (0);
598}
599
600/*
601 * Handle a finished plex sync bio.
602 */
603int
604gv_sync_complete(struct gv_plex *to, struct bio *bp)
605{
606	struct gv_plex *from, *p;
607	struct gv_sd *s;
608	struct gv_volume *v;
609	struct gv_softc *sc;
610	off_t offset;
611	int err;
612
613	g_topology_assert_not();
614
615	err = 0;
616	KASSERT(to != NULL, ("NULL to"));
617	KASSERT(bp != NULL, ("NULL bp"));
618	from = bp->bio_caller2;
619	KASSERT(from != NULL, ("NULL from"));
620	v = to->vol_sc;
621	KASSERT(v != NULL, ("NULL v"));
622	sc = v->vinumconf;
623	KASSERT(sc != NULL, ("NULL sc"));
624
625	/* If it was a read, write it. */
626	if (bp->bio_cmd == BIO_READ) {
627		err = gv_sync_request(from, to, bp->bio_offset, bp->bio_length,
628	    	    BIO_WRITE, bp->bio_data);
629	/* If it was a write, read the next one. */
630	} else if (bp->bio_cmd == BIO_WRITE) {
631		if (bp->bio_pflags & GV_BIO_MALLOC)
632			g_free(bp->bio_data);
633		to->synced += bp->bio_length;
634		/* If we're finished, clean up. */
635		if (bp->bio_offset + bp->bio_length >= from->size) {
636			G_VINUM_DEBUG(1, "syncing of %s from %s completed",
637			    to->name, from->name);
638			/* Update our state. */
639			LIST_FOREACH(s, &to->subdisks, in_plex)
640				gv_set_sd_state(s, GV_SD_UP, 0);
641			gv_update_plex_state(to);
642			to->flags &= ~GV_PLEX_SYNCING;
643			to->synced = 0;
644			gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
645		} else {
646			offset = bp->bio_offset + bp->bio_length;
647			err = gv_sync_request(from, to, offset,
648			    MIN(bp->bio_length, from->size - offset),
649			    BIO_READ, NULL);
650		}
651	}
652	g_destroy_bio(bp);
653	/* Clean up if there was an error. */
654	if (err) {
655		to->flags &= ~GV_PLEX_SYNCING;
656		G_VINUM_DEBUG(0, "error syncing plexes: error code %d", err);
657	}
658
659	/* Check if all plexes are synced, and lower refcounts. */
660	g_topology_lock();
661	LIST_FOREACH(p, &v->plexes, in_volume) {
662		if (p->flags & GV_PLEX_SYNCING) {
663			g_topology_unlock();
664			return (-1);
665		}
666	}
667	/* If we came here, all plexes are synced, and we're free. */
668	gv_access(v->provider, -1, -1, 0);
669	g_topology_unlock();
670	G_VINUM_DEBUG(1, "plex sync completed");
671	gv_volume_flush(v);
672	return (0);
673}
674
675/*
676 * Create a new bio struct for the next grow request.
677 */
678int
679gv_grow_request(struct gv_plex *p, off_t offset, off_t length, int type,
680    caddr_t data)
681{
682	struct gv_softc *sc;
683	struct bio *bp;
684
685	KASSERT(p != NULL, ("gv_grow_request: NULL p"));
686	sc = p->vinumconf;
687	KASSERT(sc != NULL, ("gv_grow_request: NULL sc"));
688
689	bp = g_new_bio();
690	if (bp == NULL) {
691		G_VINUM_DEBUG(0, "grow of %s failed creating bio: "
692		    "out of memory", p->name);
693		return (ENOMEM);
694	}
695
696	bp->bio_cmd = type;
697	bp->bio_done = NULL;
698	bp->bio_error = 0;
699	bp->bio_caller1 = p;
700	bp->bio_offset = offset;
701	bp->bio_length = length;
702	bp->bio_pflags |= GV_BIO_GROW;
703	if (data == NULL)
704		data = g_malloc(length, M_WAITOK);
705	bp->bio_pflags |= GV_BIO_MALLOC;
706	bp->bio_data = data;
707
708	gv_post_bio(sc, bp);
709	//gv_plex_start(p, bp);
710	return (0);
711}
712
713/*
714 * Finish handling of a bio to a growing plex.
715 */
716void
717gv_grow_complete(struct gv_plex *p, struct bio *bp)
718{
719	struct gv_softc *sc;
720	struct gv_sd *s;
721	struct gv_volume *v;
722	off_t origsize, offset;
723	int sdcount, err;
724
725	v = p->vol_sc;
726	KASSERT(v != NULL, ("gv_grow_complete: NULL v"));
727	sc = v->vinumconf;
728	KASSERT(sc != NULL, ("gv_grow_complete: NULL sc"));
729	err = 0;
730
731	/* If it was a read, write it. */
732	if (bp->bio_cmd == BIO_READ) {
733		p->synced += bp->bio_length;
734		err = gv_grow_request(p, bp->bio_offset, bp->bio_length,
735		    BIO_WRITE, bp->bio_data);
736	/* If it was a write, read next. */
737	} else if (bp->bio_cmd == BIO_WRITE) {
738		if (bp->bio_pflags & GV_BIO_MALLOC)
739			g_free(bp->bio_data);
740
741		/* Find the real size of the plex. */
742		sdcount = gv_sdcount(p, 1);
743		s = LIST_FIRST(&p->subdisks);
744		KASSERT(s != NULL, ("NULL s"));
745		origsize = (s->size * (sdcount - 1));
746		if (bp->bio_offset + bp->bio_length >= origsize) {
747			G_VINUM_DEBUG(1, "growing of %s completed", p->name);
748			p->flags &= ~GV_PLEX_GROWING;
749			LIST_FOREACH(s, &p->subdisks, in_plex) {
750				s->flags &= ~GV_SD_GROW;
751				gv_set_sd_state(s, GV_SD_UP, 0);
752			}
753			p->size = gv_plex_size(p);
754			gv_update_vol_size(v, gv_vol_size(v));
755			gv_set_plex_state(p, GV_PLEX_UP, 0);
756			g_topology_lock();
757			gv_access(v->provider, -1, -1, 0);
758			g_topology_unlock();
759			p->synced = 0;
760			gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
761			/* Issue delayed requests. */
762			gv_plex_flush(p);
763		} else {
764			offset = bp->bio_offset + bp->bio_length;
765			err = gv_grow_request(p, offset,
766			   MIN(bp->bio_length, origsize - offset),
767			   BIO_READ, NULL);
768		}
769	}
770	g_destroy_bio(bp);
771
772	if (err) {
773		p->flags &= ~GV_PLEX_GROWING;
774		G_VINUM_DEBUG(0, "error growing plex: error code %d", err);
775	}
776}
777
778/*
779 * Create an initialization BIO and send it off to the consumer. Assume that
780 * we're given initialization data as parameter.
781 */
782void
783gv_init_request(struct gv_sd *s, off_t start, caddr_t data, off_t length)
784{
785	struct gv_drive *d;
786	struct g_consumer *cp;
787	struct bio *bp, *cbp;
788
789	KASSERT(s != NULL, ("gv_init_request: NULL s"));
790	d = s->drive_sc;
791	KASSERT(d != NULL, ("gv_init_request: NULL d"));
792	cp = d->consumer;
793	KASSERT(cp != NULL, ("gv_init_request: NULL cp"));
794
795	bp = g_new_bio();
796	if (bp == NULL) {
797		G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd"
798		    " (drive offset %jd); out of memory", s->name,
799		    (intmax_t)s->initialized, (intmax_t)start);
800		return; /* XXX: Error codes. */
801	}
802	bp->bio_cmd = BIO_WRITE;
803	bp->bio_data = data;
804	bp->bio_done = NULL;
805	bp->bio_error = 0;
806	bp->bio_length = length;
807	bp->bio_pflags |= GV_BIO_INIT;
808	bp->bio_offset = start;
809	bp->bio_caller1 = s;
810
811	/* Then ofcourse, we have to clone it. */
812	cbp = g_clone_bio(bp);
813	if (cbp == NULL) {
814		G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd"
815		    " (drive offset %jd); out of memory", s->name,
816		    (intmax_t)s->initialized, (intmax_t)start);
817		return; /* XXX: Error codes. */
818	}
819	cbp->bio_done = gv_done;
820	cbp->bio_caller1 = s;
821	d->active++;
822	/* Send it off to the consumer. */
823	g_io_request(cbp, cp);
824}
825
826/*
827 * Handle a finished initialization BIO.
828 */
829void
830gv_init_complete(struct gv_plex *p, struct bio *bp)
831{
832	struct gv_softc *sc;
833	struct gv_drive *d;
834	struct g_consumer *cp;
835	struct gv_sd *s;
836	off_t start, length;
837	caddr_t data;
838	int error;
839
840	s = bp->bio_caller1;
841	start = bp->bio_offset;
842	length = bp->bio_length;
843	error = bp->bio_error;
844	data = bp->bio_data;
845
846	KASSERT(s != NULL, ("gv_init_complete: NULL s"));
847	d = s->drive_sc;
848	KASSERT(d != NULL, ("gv_init_complete: NULL d"));
849	cp = d->consumer;
850	KASSERT(cp != NULL, ("gv_init_complete: NULL cp"));
851	sc = p->vinumconf;
852	KASSERT(sc != NULL, ("gv_init_complete: NULL sc"));
853
854	g_destroy_bio(bp);
855
856	/*
857	 * First we need to find out if it was okay, and abort if it's not.
858	 * Then we need to free previous buffers, find out the correct subdisk,
859	 * as well as getting the correct starting point and length of the BIO.
860	 */
861	if (start >= s->drive_offset + s->size) {
862		/* Free the data we initialized. */
863		if (data != NULL)
864			g_free(data);
865		g_topology_assert_not();
866		g_topology_lock();
867		g_access(cp, 0, -1, 0);
868		g_topology_unlock();
869		if (error) {
870			gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE |
871			    GV_SETSTATE_CONFIG);
872		} else {
873			gv_set_sd_state(s, GV_SD_UP, GV_SETSTATE_CONFIG);
874			s->initialized = 0;
875			gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
876			G_VINUM_DEBUG(1, "subdisk '%s' init: finished "
877			    "successfully", s->name);
878		}
879		return;
880	}
881	s->initialized += length;
882	start += length;
883	gv_init_request(s, start, data, length);
884}
885
886/*
887 * Create a new bio struct for the next parity rebuild. Used both by internal
888 * rebuild of degraded plexes as well as user initiated rebuilds/checks.
889 */
890void
891gv_parity_request(struct gv_plex *p, int flags, off_t offset)
892{
893	struct gv_softc *sc;
894	struct bio *bp;
895
896	KASSERT(p != NULL, ("gv_parity_request: NULL p"));
897	sc = p->vinumconf;
898	KASSERT(sc != NULL, ("gv_parity_request: NULL sc"));
899
900	bp = g_new_bio();
901	if (bp == NULL) {
902		G_VINUM_DEBUG(0, "rebuild of %s failed creating bio: "
903		    "out of memory", p->name);
904		return;
905	}
906
907	bp->bio_cmd = BIO_WRITE;
908	bp->bio_done = NULL;
909	bp->bio_error = 0;
910	bp->bio_length = p->stripesize;
911	bp->bio_caller1 = p;
912
913	/*
914	 * Check if it's a rebuild of a degraded plex or a user request of
915	 * parity rebuild.
916	 */
917	if (flags & GV_BIO_REBUILD)
918		bp->bio_data = g_malloc(GV_DFLT_SYNCSIZE, M_WAITOK);
919	else if (flags & GV_BIO_CHECK)
920		bp->bio_data = g_malloc(p->stripesize, M_WAITOK | M_ZERO);
921	else {
922		G_VINUM_DEBUG(0, "invalid flags given in rebuild");
923		return;
924	}
925
926	bp->bio_pflags = flags;
927	bp->bio_pflags |= GV_BIO_MALLOC;
928
929	/* We still have more parity to build. */
930	bp->bio_offset = offset;
931	gv_post_bio(sc, bp);
932	//gv_plex_start(p, bp); /* Send it down to the plex. */
933}
934
935/*
936 * Handle a finished parity write.
937 */
938void
939gv_parity_complete(struct gv_plex *p, struct bio *bp)
940{
941	struct gv_softc *sc;
942	int error, flags;
943
944	error = bp->bio_error;
945	flags = bp->bio_pflags;
946	flags &= ~GV_BIO_MALLOC;
947
948	sc = p->vinumconf;
949	KASSERT(sc != NULL, ("gv_parity_complete: NULL sc"));
950
951	/* Clean up what we allocated. */
952	if (bp->bio_pflags & GV_BIO_MALLOC)
953		g_free(bp->bio_data);
954	g_destroy_bio(bp);
955
956	if (error == EAGAIN) {
957		G_VINUM_DEBUG(0, "parity incorrect at offset 0x%jx",
958		    (intmax_t)p->synced);
959	}
960
961	/* Any error is fatal, except EAGAIN when we're rebuilding. */
962	if (error && !(error == EAGAIN && (flags & GV_BIO_PARITY))) {
963		/* Make sure we don't have the lock. */
964		g_topology_assert_not();
965		g_topology_lock();
966		gv_access(p->vol_sc->provider, -1, -1, 0);
967		g_topology_unlock();
968		G_VINUM_DEBUG(0, "parity check on %s failed at 0x%jx "
969		    "errno %d", p->name, (intmax_t)p->synced, error);
970		return;
971	} else {
972		p->synced += p->stripesize;
973	}
974
975	if (p->synced >= p->size) {
976		/* Make sure we don't have the lock. */
977		g_topology_assert_not();
978		g_topology_lock();
979		gv_access(p->vol_sc->provider, -1, -1, 0);
980		g_topology_unlock();
981		/* We're finished. */
982		G_VINUM_DEBUG(1, "parity operation on %s finished", p->name);
983		p->synced = 0;
984		gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
985		return;
986	}
987
988	/* Send down next. It will determine if we need to itself. */
989	gv_parity_request(p, flags, p->synced);
990}
991
992/*
993 * Handle a finished plex rebuild bio.
994 */
995void
996gv_rebuild_complete(struct gv_plex *p, struct bio *bp)
997{
998	struct gv_softc *sc;
999	struct gv_sd *s;
1000	int error, flags;
1001	off_t offset;
1002
1003	error = bp->bio_error;
1004	flags = bp->bio_pflags;
1005	offset = bp->bio_offset;
1006	flags &= ~GV_BIO_MALLOC;
1007	sc = p->vinumconf;
1008	KASSERT(sc != NULL, ("gv_rebuild_complete: NULL sc"));
1009
1010	/* Clean up what we allocated. */
1011	if (bp->bio_pflags & GV_BIO_MALLOC)
1012		g_free(bp->bio_data);
1013	g_destroy_bio(bp);
1014
1015	if (error) {
1016		g_topology_assert_not();
1017		g_topology_lock();
1018		gv_access(p->vol_sc->provider, -1, -1, 0);
1019		g_topology_unlock();
1020
1021		G_VINUM_DEBUG(0, "rebuild of %s failed at offset %jd errno: %d",
1022		    p->name, (intmax_t)offset, error);
1023		p->flags &= ~GV_PLEX_REBUILDING;
1024		p->synced = 0;
1025		gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */
1026		return;
1027	}
1028
1029	offset += (p->stripesize * (gv_sdcount(p, 1) - 1));
1030	if (offset >= p->size) {
1031		/* We're finished. */
1032		g_topology_assert_not();
1033		g_topology_lock();
1034		gv_access(p->vol_sc->provider, -1, -1, 0);
1035		g_topology_unlock();
1036
1037		G_VINUM_DEBUG(1, "rebuild of %s finished", p->name);
1038		gv_save_config(p->vinumconf);
1039		p->flags &= ~GV_PLEX_REBUILDING;
1040		p->synced = 0;
1041		/* Try to up all subdisks. */
1042		LIST_FOREACH(s, &p->subdisks, in_plex)
1043			gv_update_sd_state(s);
1044		gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
1045		gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */
1046		return;
1047	}
1048
1049	/* Send down next. It will determine if we need to itself. */
1050	gv_parity_request(p, flags, offset);
1051}
1052