/* geom_vinum_plex.c — revision 135426 */
1130389Sle/*-
2130389Sle * Copyright (c) 2004 Lukas Ertl
3130389Sle * All rights reserved.
4130389Sle *
5130389Sle * Redistribution and use in source and binary forms, with or without
6130389Sle * modification, are permitted provided that the following conditions
7130389Sle * are met:
8130389Sle * 1. Redistributions of source code must retain the above copyright
9130389Sle *    notice, this list of conditions and the following disclaimer.
10130389Sle * 2. Redistributions in binary form must reproduce the above copyright
11130389Sle *    notice, this list of conditions and the following disclaimer in the
12130389Sle *    documentation and/or other materials provided with the distribution.
13130389Sle *
14130389Sle * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15130389Sle * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16130389Sle * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17130389Sle * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18130389Sle * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19130389Sle * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20130389Sle * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21130389Sle * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22130389Sle * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23130389Sle * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24130389Sle * SUCH DAMAGE.
25130389Sle */
26130389Sle
27130389Sle#include <sys/cdefs.h>
28130389Sle__FBSDID("$FreeBSD: head/sys/geom/vinum/geom_vinum_plex.c 135426 2004-09-18 13:44:43Z le $");
29130389Sle
30130389Sle#include <sys/param.h>
31130389Sle#include <sys/bio.h>
32130389Sle#include <sys/kernel.h>
33130389Sle#include <sys/kthread.h>
34130389Sle#include <sys/libkern.h>
35130389Sle#include <sys/lock.h>
36130389Sle#include <sys/malloc.h>
37130389Sle#include <sys/module.h>
38130389Sle#include <sys/mutex.h>
39130389Sle#include <sys/systm.h>
40130389Sle
41130389Sle#include <geom/geom.h>
42130389Sle#include <geom/vinum/geom_vinum_var.h>
43130389Sle#include <geom/vinum/geom_vinum_raid5.h>
44130389Sle#include <geom/vinum/geom_vinum.h>
45130389Sle
46135426Slestatic void gv_plex_completed_request(struct gv_plex *, struct bio *);
47135426Slestatic void gv_plex_normal_request(struct gv_plex *, struct bio *);
48135426Slestatic void gv_plex_worker(void *);
49135426Sle
50130389Sle/* XXX: is this the place to catch dying subdisks? */
51130389Slestatic void
52130389Slegv_plex_orphan(struct g_consumer *cp)
53130389Sle{
54130389Sle	struct g_geom *gp;
55130389Sle	struct gv_plex *p;
56130389Sle	int error;
57130389Sle
58130389Sle	g_topology_assert();
59130389Sle	gp = cp->geom;
60130389Sle	g_trace(G_T_TOPOLOGY, "gv_plex_orphan(%s)", gp->name);
61130389Sle
62130389Sle	if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0)
63130389Sle		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
64130389Sle	error = cp->provider->error;
65130389Sle	if (error == 0)
66130389Sle		error = ENXIO;
67130389Sle	g_detach(cp);
68130389Sle	g_destroy_consumer(cp);
69130389Sle	if (!LIST_EMPTY(&gp->consumer))
70130389Sle		return;
71130389Sle
72130389Sle	p = gp->softc;
73130697Sle	if (p != NULL) {
74135164Sle		gv_kill_plex_thread(p);
75130697Sle		p->geom = NULL;
76130697Sle		p->provider = NULL;
77130697Sle		p->consumer = NULL;
78130697Sle	}
79130597Sle	gp->softc = NULL;
80130389Sle	g_wither_geom(gp, error);
81130389Sle}
82130389Sle
/*
 * Completion callback for sub-requests issued on behalf of this plex.
 * Marks the bio as done and hands it to the plex worker thread via the
 * bqueue, so all post-processing happens in worker context.
 */
void
gv_plex_done(struct bio *bp)
{
	struct gv_plex *p;
	struct gv_bioq *bq;

	p = bp->bio_from->geom->softc;
	bp->bio_cflags |= GV_BIO_DONE;
	/*
	 * NOTE(review): M_NOWAIT can return NULL, which would be
	 * dereferenced on the next line — needs a failure strategy that is
	 * safe in bio-completion context; confirm against later revisions.
	 */
	bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO);
	bq->bp = bp;
	mtx_lock(&p->bqueue_mtx);
	TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
	/* Wake the worker that sleeps on the plex softc address. */
	wakeup(p);
	mtx_unlock(&p->bqueue_mtx);
}
98130389Sle
99130389Sle/* Find the correct subdisk to send the bio to and build a bio to send. */
100130389Slestatic int
101135426Slegv_plexbuffer(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, off_t bcount)
102130389Sle{
103130389Sle	struct g_geom *gp;
104130389Sle	struct gv_sd *s;
105135426Sle	struct bio *cbp, *pbp;
106130389Sle	int i, sdno;
107135426Sle	off_t len_left, real_len, real_off;
108135426Sle	off_t stripeend, stripeno, stripestart;
109130389Sle
110135426Sle	if (p == NULL || LIST_EMPTY(&p->subdisks))
111135426Sle		return (ENXIO);
112135426Sle
113130389Sle	s = NULL;
114130389Sle	gp = bp->bio_to->geom;
115130389Sle
116130389Sle	/*
117130389Sle	 * We only handle concatenated and striped plexes here.  RAID5 plexes
118130389Sle	 * are handled in build_raid5_request().
119130389Sle	 */
120130389Sle	switch (p->org) {
121130389Sle	case GV_PLEX_CONCAT:
122130389Sle		/*
123130389Sle		 * Find the subdisk where this request starts.  The subdisks in
124130389Sle		 * this list must be ordered by plex_offset.
125130389Sle		 */
126130389Sle		LIST_FOREACH(s, &p->subdisks, in_plex) {
127130389Sle			if (s->plex_offset <= boff &&
128130389Sle			    s->plex_offset + s->size > boff)
129130389Sle				break;
130130389Sle		}
131130389Sle		/* Subdisk not found. */
132130389Sle		if (s == NULL)
133130389Sle			return (ENXIO);
134130389Sle
135130389Sle		/* Calculate corresponding offsets on disk. */
136130389Sle		real_off = boff - s->plex_offset;
137130389Sle		len_left = s->size - real_off;
138130389Sle		real_len = (bcount > len_left) ? len_left : bcount;
139130389Sle		break;
140130389Sle
141130389Sle	case GV_PLEX_STRIPED:
142130389Sle		/* The number of the stripe where the request starts. */
143130389Sle		stripeno = boff / p->stripesize;
144130389Sle
145130389Sle		/* The number of the subdisk where the stripe resides. */
146130389Sle		sdno = stripeno % p->sdcount;
147130389Sle
148130389Sle		/* Find the right subdisk. */
149130389Sle		i = 0;
150130389Sle		LIST_FOREACH(s, &p->subdisks, in_plex) {
151130389Sle			if (i == sdno)
152130389Sle				break;
153130389Sle			i++;
154130389Sle		}
155130389Sle
156130389Sle		/* Subdisk not found. */
157130389Sle		if (s == NULL)
158130389Sle			return (ENXIO);
159130389Sle
160130389Sle		/* The offset of the stripe from the start of the subdisk. */
161130389Sle		stripestart = (stripeno / p->sdcount) *
162130389Sle		    p->stripesize;
163130389Sle
164130389Sle		/* The offset at the end of the stripe. */
165130389Sle		stripeend = stripestart + p->stripesize;
166130389Sle
167130389Sle		/* The offset of the request on this subdisk. */
168130389Sle		real_off = boff - (stripeno * p->stripesize) +
169130389Sle		    stripestart;
170130389Sle
171130389Sle		/* The length left in this stripe. */
172130389Sle		len_left = stripeend - real_off;
173130389Sle
174130389Sle		real_len = (bcount <= len_left) ? bcount : len_left;
175130389Sle		break;
176130389Sle
177130389Sle	default:
178130389Sle		return (EINVAL);
179130389Sle	}
180130389Sle
181130389Sle	/* Now check if we can handle the request on this subdisk. */
182130389Sle	switch (s->state) {
183130389Sle	case GV_SD_UP:
184130389Sle		/* If the subdisk is up, just continue. */
185130389Sle		break;
186130389Sle
187130389Sle	case GV_SD_STALE:
188135426Sle		if (!(bp->bio_cflags & GV_BIO_SYNCREQ))
189130389Sle			return (ENXIO);
190130389Sle
191135426Sle		printf("GEOM_VINUM: sd %s is initializing\n", s->name);
192130389Sle		gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE);
193130389Sle		break;
194130389Sle
195130389Sle	case GV_SD_INITIALIZING:
196130389Sle		if (bp->bio_cmd == BIO_READ)
197130389Sle			return (ENXIO);
198130389Sle		break;
199130389Sle
200130389Sle	default:
201130389Sle		/* All other subdisk states mean it's not accessible. */
202130389Sle		return (ENXIO);
203130389Sle	}
204130389Sle
205130389Sle	/* Clone the bio and adjust the offsets and sizes. */
206130389Sle	cbp = g_clone_bio(bp);
207130389Sle	if (cbp == NULL)
208130389Sle		return (ENOMEM);
209130389Sle	cbp->bio_offset = real_off;
210130389Sle	cbp->bio_length = real_len;
211130389Sle	cbp->bio_data = addr;
212135426Sle	cbp->bio_done = g_std_done;
213135426Sle	cbp->bio_caller2 = s->consumer;
214135426Sle	if ((bp->bio_cflags & GV_BIO_SYNCREQ)) {
215135426Sle		cbp->bio_cflags |= GV_BIO_SYNCREQ;
216130389Sle		cbp->bio_done = gv_plex_done;
217135426Sle	}
218135426Sle
219135426Sle	if (bp->bio_driver1 == NULL) {
220135426Sle		bp->bio_driver1 = cbp;
221135426Sle	} else {
222135426Sle		pbp = bp->bio_driver1;
223135426Sle		while (pbp->bio_caller1 != NULL)
224135426Sle			pbp = pbp->bio_caller1;
225135426Sle		pbp->bio_caller1 = cbp;
226135426Sle	}
227135426Sle
228130389Sle	return (0);
229130389Sle}
230130389Sle
231130389Slestatic void
232130389Slegv_plex_start(struct bio *bp)
233130389Sle{
234130389Sle	struct gv_plex *p;
235135426Sle	struct gv_bioq *bq;
236130389Sle
237135426Sle	switch(bp->bio_cmd) {
238135426Sle	case BIO_READ:
239135426Sle	case BIO_WRITE:
240135426Sle	case BIO_DELETE:
241135426Sle		break;
242135426Sle	case BIO_GETATTR:
243135426Sle	default:
244135426Sle		g_io_deliver(bp, EOPNOTSUPP);
245135426Sle		return;
246135426Sle	}
247130389Sle
248130389Sle	/*
249130389Sle	 * We cannot handle this request if too many of our subdisks are
250130389Sle	 * inaccessible.
251130389Sle	 */
252135426Sle	p = bp->bio_to->geom->softc;
253135426Sle	if ((p->state < GV_PLEX_DEGRADED) &&
254135426Sle	    !(bp->bio_cflags & GV_BIO_SYNCREQ)) {
255135426Sle		g_io_deliver(bp, ENXIO);
256130389Sle		return;
257130389Sle	}
258130389Sle
259135426Sle	bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO);
260135426Sle	bq->bp = bp;
261135426Sle	mtx_lock(&p->bqueue_mtx);
262135426Sle	TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
263135426Sle	wakeup(p);
264135426Sle	mtx_unlock(&p->bqueue_mtx);
265135426Sle}
266135426Sle
/*
 * Per-plex worker thread.  Drains the plex's bqueue and dispatches each
 * queued bio depending on its flags: completed sub-requests, held-back
 * (stripe-locked) sub-requests, or fresh requests from above.  Exits
 * when GV_PLEX_THREAD_DIE is set, marking itself GV_PLEX_THREAD_DEAD.
 */
static void
gv_plex_worker(void *arg)
{
	struct bio *bp;
	struct gv_plex *p;
	struct gv_sd *s;
	struct gv_bioq *bq;

	p = arg;
	KASSERT(p != NULL, ("NULL p"));

	/* The queue mutex is held at the top of every loop iteration. */
	mtx_lock(&p->bqueue_mtx);
	for (;;) {
		/* We were signaled to exit. */
		if (p->flags & GV_PLEX_THREAD_DIE)
			break;

		/* Take the first BIO from our queue. */
		bq = TAILQ_FIRST(&p->bqueue);
		if (bq == NULL) {
			/* Poll every hz/10 so THREAD_DIE is seen promptly. */
			msleep(p, &p->bqueue_mtx, PRIBIO, "-", hz/10);
			continue;
		}
		TAILQ_REMOVE(&p->bqueue, bq, queue);
		mtx_unlock(&p->bqueue_mtx);

		bp = bq->bp;

		/* A completed request. */
		if (bp->bio_cflags & GV_BIO_DONE) {
			g_free(bq);
			if (bp->bio_cflags & GV_BIO_SYNCREQ) {
				/*
				 * Completed initialization I/O: account the
				 * progress on the subdisk and bring it up
				 * once fully initialized.
				 */
				s = bp->bio_to->private;
				if (bp->bio_error == 0)
					s->initialized += bp->bio_length;
				if (s->initialized >= s->size) {
					g_topology_lock();
					gv_set_sd_state(s, GV_SD_UP,
					    GV_SETSTATE_CONFIG);
					g_topology_unlock();
					s->initialized = 0;
				}
				g_std_done(bp);
			} else
				gv_plex_completed_request(p, bp);
		/*
		 * A sub-request that was hold back because it interfered with
		 * another sub-request.
		 */
		} else if (bp->bio_cflags & GV_BIO_ONHOLD) {
			/* Is it still locked out? */
			if (gv_stripe_active(p, bp)) {
				/* Yes: requeue it and try again later. */
				mtx_lock(&p->bqueue_mtx);
				TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
				mtx_unlock(&p->bqueue_mtx);
			} else {
				g_free(bq);
				bp->bio_cflags &= ~GV_BIO_ONHOLD;
				/* bio_caller2 holds the target consumer. */
				g_io_request(bp, bp->bio_caller2);
			}

		/* A normal request to this plex. */
		} else {
			g_free(bq);
			gv_plex_normal_request(p, bp);
		}

		mtx_lock(&p->bqueue_mtx);
	}
	mtx_unlock(&p->bqueue_mtx);
	/* Tell gv_kill_plex_thread() we are gone. */
	p->flags |= GV_PLEX_THREAD_DEAD;
	wakeup(p);

	kthread_exit(ENXIO);
}
342135426Sle
343135426Slevoid
344135426Slegv_plex_completed_request(struct gv_plex *p, struct bio *bp)
345135426Sle{
346135426Sle	struct bio *cbp, *pbp;
347135426Sle	struct gv_bioq *bq, *bq2;
348135426Sle	struct gv_raid5_packet *wp;
349135426Sle	int i;
350135426Sle
351135426Sle	wp = bp->bio_driver1;
352135426Sle
353135426Sle	switch (bp->bio_parent->bio_cmd) {
354135426Sle	case BIO_READ:
355135426Sle		if (wp == NULL)
356135426Sle			break;
357135426Sle
358135426Sle		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
359135426Sle			if (bq->bp == bp) {
360135426Sle				TAILQ_REMOVE(&wp->bits, bq, queue);
361135426Sle				g_free(bq);
362135426Sle				for (i = 0; i < wp->length; i++)
363135426Sle					wp->data[i] ^= bp->bio_data[i];
364135426Sle				break;
365130389Sle			}
366135426Sle		}
367135426Sle		if (TAILQ_EMPTY(&wp->bits)) {
368135426Sle			bp->bio_parent->bio_completed += wp->length;
369135426Sle			if (wp->lockbase != -1)
370135426Sle				TAILQ_REMOVE(&p->packets, wp, list);
371135426Sle			g_free(wp);
372135426Sle		}
373130389Sle
374135426Sle		break;
375135426Sle
376135426Sle 	case BIO_WRITE:
377135426Sle		if (wp == NULL)
378135426Sle			break;
379135426Sle
380135426Sle		/* Check if we need to handle parity data. */
381135426Sle		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
382135426Sle			if (bq->bp == bp) {
383135426Sle				TAILQ_REMOVE(&wp->bits, bq, queue);
384135426Sle				g_free(bq);
385135426Sle				cbp = wp->parity;
386135426Sle				if (cbp != NULL) {
387135426Sle					for (i = 0; i < wp->length; i++)
388135426Sle						cbp->bio_data[i] ^=
389135426Sle						    bp->bio_data[i];
390135426Sle				}
391135426Sle				break;
392135426Sle			}
393135426Sle		}
394135426Sle
395135426Sle		/* Handle parity data. */
396135426Sle		if (TAILQ_EMPTY(&wp->bits)) {
397135426Sle			if (wp->waiting != NULL) {
398135426Sle				pbp = wp->waiting;
399135426Sle				wp->waiting = NULL;
400135426Sle				cbp = wp->parity;
401135426Sle				for (i = 0; i < wp->length; i++)
402135426Sle					cbp->bio_data[i] ^= pbp->bio_data[i];
403135426Sle				g_io_request(pbp, pbp->bio_caller2);
404135426Sle			} else if (wp->parity != NULL) {
405135426Sle				cbp = wp->parity;
406135426Sle				wp->parity = NULL;
407135426Sle				g_io_request(cbp, cbp->bio_caller2);
408130389Sle			} else {
409135426Sle				bp->bio_parent->bio_completed += wp->length;
410135426Sle				TAILQ_REMOVE(&p->packets, wp, list);
411135426Sle				g_free(wp);
412130389Sle			}
413135426Sle		}
414130389Sle
415135426Sle		break;
416135426Sle	}
417135426Sle
418135426Sle	pbp = bp->bio_parent;
419135426Sle	if (pbp->bio_error == 0)
420135426Sle		pbp->bio_error = bp->bio_error;
421135426Sle
422135426Sle	/* When the original request is finished, we deliver it. */
423135426Sle	pbp->bio_inbed++;
424135426Sle	if (pbp->bio_inbed == pbp->bio_children)
425135426Sle		g_io_deliver(pbp, pbp->bio_error);
426135426Sle
427135426Sle	/* Clean up what we allocated. */
428135426Sle	if (bp->bio_cflags & GV_BIO_MALLOC)
429135426Sle		g_free(bp->bio_data);
430135426Sle	g_destroy_bio(bp);
431135426Sle}
432135426Sle
433135426Slevoid
434135426Slegv_plex_normal_request(struct gv_plex *p, struct bio *bp)
435135426Sle{
436135426Sle	struct bio *cbp, *pbp;
437135426Sle	struct gv_bioq *bq, *bq2;
438135426Sle	struct gv_raid5_packet *wp, *wp2;
439135426Sle	caddr_t addr;
440135426Sle	off_t bcount, boff;
441135426Sle	int err;
442135426Sle
443135426Sle	bcount = bp->bio_length;
444135426Sle	addr = bp->bio_data;
445135426Sle	boff = bp->bio_offset;
446135426Sle
447135426Sle	/* Walk over the whole length of the request, we might split it up. */
448135426Sle	while (bcount > 0) {
449135426Sle		wp = NULL;
450135426Sle
451135426Sle 		/*
452135426Sle		 * RAID5 plexes need special treatment, as a single write
453135426Sle		 * request involves several read/write sub-requests.
454135426Sle 		 */
455135426Sle		if (p->org == GV_PLEX_RAID5) {
456135426Sle			wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO);
457135426Sle			wp->bio = bp;
458135426Sle			TAILQ_INIT(&wp->bits);
459135426Sle
460135426Sle			err = gv_build_raid5_req(p, wp, bp, addr, boff, bcount);
461135426Sle
462135426Sle 			/*
463135426Sle			 * Building the sub-request failed, we probably need to
464135426Sle			 * clean up a lot.
465135426Sle 			 */
466135426Sle 			if (err) {
467135426Sle				printf("GEOM_VINUM: plex request failed for ");
468135426Sle				g_print_bio(bp);
469135426Sle				printf("\n");
470135426Sle				TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
471135426Sle					TAILQ_REMOVE(&wp->bits, bq, queue);
472135426Sle					g_free(bq);
473135426Sle				}
474135426Sle				if (wp->waiting != NULL) {
475135426Sle					if (wp->waiting->bio_cflags &
476135426Sle					    GV_BIO_MALLOC)
477135426Sle						g_free(wp->waiting->bio_data);
478135426Sle					g_destroy_bio(wp->waiting);
479135426Sle				}
480135426Sle				if (wp->parity != NULL) {
481135426Sle					if (wp->parity->bio_cflags &
482135426Sle					    GV_BIO_MALLOC)
483135426Sle						g_free(wp->parity->bio_data);
484135426Sle					g_destroy_bio(wp->parity);
485135426Sle				}
486135426Sle				g_free(wp);
487135426Sle
488135426Sle				TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) {
489135426Sle					if (wp->bio == bp) {
490135426Sle						TAILQ_REMOVE(&p->packets, wp,
491135426Sle						    list);
492135426Sle						TAILQ_FOREACH_SAFE(bq,
493135426Sle						    &wp->bits, queue, bq2) {
494135426Sle							TAILQ_REMOVE(&wp->bits,
495135426Sle							    bq, queue);
496135426Sle							g_free(bq);
497135426Sle						}
498135426Sle						g_free(wp);
499135426Sle					}
500135426Sle				}
501135426Sle
502135426Sle				cbp = bp->bio_driver1;
503135426Sle				while (cbp != NULL) {
504135426Sle					pbp = cbp->bio_caller1;
505135426Sle					if (cbp->bio_cflags & GV_BIO_MALLOC)
506135426Sle						g_free(cbp->bio_data);
507135426Sle					g_destroy_bio(cbp);
508135426Sle					cbp = pbp;
509135426Sle				}
510135426Sle
511135426Sle				g_io_deliver(bp, err);
512135426Sle 				return;
513135426Sle 			}
514135426Sle
515135426Sle			if (TAILQ_EMPTY(&wp->bits))
516135426Sle				g_free(wp);
517135426Sle			else if (wp->lockbase != -1)
518135426Sle				TAILQ_INSERT_TAIL(&p->packets, wp, list);
519135426Sle
520135426Sle		/*
521135426Sle		 * Requests to concatenated and striped plexes go straight
522135426Sle		 * through.
523135426Sle		 */
524135426Sle		} else {
525135426Sle			err = gv_plexbuffer(p, bp, addr, boff, bcount);
526135426Sle
527135426Sle			/* Building the sub-request failed. */
528135426Sle			if (err) {
529135426Sle				printf("GEOM_VINUM: plex request failed for ");
530135426Sle				g_print_bio(bp);
531135426Sle				printf("\n");
532135426Sle				cbp = bp->bio_driver1;
533135426Sle				while (cbp != NULL) {
534135426Sle					pbp = cbp->bio_caller1;
535135426Sle					g_destroy_bio(cbp);
536135426Sle					cbp = pbp;
537135426Sle				}
538135426Sle				g_io_deliver(bp, err);
539135426Sle				return;
540135426Sle			}
541130389Sle		}
542135426Sle
543135426Sle		/* Abuse bio_caller1 as linked list. */
544135426Sle		pbp = bp->bio_driver1;
545135426Sle		while (pbp->bio_caller1 != NULL)
546135426Sle			pbp = pbp->bio_caller1;
547135426Sle		bcount -= pbp->bio_length;
548135426Sle		addr += pbp->bio_length;
549135426Sle		boff += pbp->bio_length;
550135426Sle	}
551130389Sle
552135426Sle	/* Fire off all sub-requests. */
553135426Sle	pbp = bp->bio_driver1;
554135426Sle	while (pbp != NULL) {
555135426Sle		/*
556135426Sle		 * RAID5 sub-requests need to come in correct order, otherwise
557135426Sle		 * we trip over the parity, as it might be overwritten by
558135426Sle		 * another sub-request.
559135426Sle		 */
560135426Sle		if (pbp->bio_driver1 != NULL &&
561135426Sle		    gv_stripe_active(p, pbp)) {
562135426Sle			pbp->bio_cflags |= GV_BIO_ONHOLD;
563135426Sle			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
564135426Sle			bq->bp = pbp;
565135426Sle			mtx_lock(&p->bqueue_mtx);
566135426Sle			TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
567135426Sle			mtx_unlock(&p->bqueue_mtx);
568135426Sle		} else
569135426Sle			g_io_request(pbp, pbp->bio_caller2);
570135426Sle		pbp = pbp->bio_caller1;
571130389Sle	}
572130389Sle}
573130389Sle
574130389Slestatic int
575130389Slegv_plex_access(struct g_provider *pp, int dr, int dw, int de)
576130389Sle{
577130389Sle	struct g_geom *gp;
578130389Sle	struct g_consumer *cp, *cp2;
579130389Sle	int error;
580130389Sle
581130389Sle	gp = pp->geom;
582130389Sle
583130389Sle	error = ENXIO;
584130389Sle	LIST_FOREACH(cp, &gp->consumer, consumer) {
585130389Sle		error = g_access(cp, dr, dw, de);
586130389Sle		if (error) {
587130389Sle			LIST_FOREACH(cp2, &gp->consumer, consumer) {
588130389Sle				if (cp == cp2)
589130389Sle					break;
590130389Sle				g_access(cp2, -dr, -dw, -de);
591130389Sle			}
592130389Sle			return (error);
593130389Sle		}
594130389Sle	}
595130389Sle	return (error);
596130389Sle}
597130389Sle
/*
 * GEOM taste method: attach to providers of the VINUMDRIVE class
 * (subdisks), wire the subdisk into its plex and either extend the
 * existing plex geom with a new consumer or create the plex geom, its
 * worker thread and its outside-world provider.
 */
static struct g_geom *
gv_plex_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_geom *gp;
	struct g_consumer *cp, *cp2;
	struct g_provider *pp2;
	struct gv_plex *p;
	struct gv_sd *s;
	struct gv_softc *sc;
	int error;

	g_trace(G_T_TOPOLOGY, "gv_plex_taste(%s, %s)", mp->name, pp->name);
	g_topology_assert();

	/* We only want to attach to subdisks. */
	if (strcmp(pp->geom->class->name, "VINUMDRIVE"))
		return (NULL);

	/* Find the VINUM class and its associated geom. */
	gp = find_vinum_geom();
	if (gp == NULL)
		return (NULL);
	sc = gp->softc;
	KASSERT(sc != NULL, ("gv_plex_taste: NULL sc"));

	/* Find out which subdisk the offered provider corresponds to. */
	s = pp->private;
	KASSERT(s != NULL, ("gv_plex_taste: NULL s"));

	/* Now find the correct plex where this subdisk belongs to. */
	p = gv_find_plex(sc, s->plex);
	KASSERT(p != NULL, ("gv_plex_taste: NULL p"));

	/*
	 * Add this subdisk to this plex.  Since we trust the on-disk
	 * configuration, we don't check the given value (should we?).
	 * XXX: shouldn't be done here
	 */
	gv_sd_to_plex(p, s, 0);

	/* Now check if there's already a geom for this plex. */
	gp = p->geom;

	/* Yes, there is already a geom, so we just add the consumer. */
	if (gp != NULL) {
		cp2 = LIST_FIRST(&gp->consumer);
		/* Need to attach a new consumer to this subdisk. */
		cp = g_new_consumer(gp);
		error = g_attach(cp, pp);
		if (error) {
			printf("geom_vinum: couldn't attach consumer to %s\n",
			    pp->name);
			g_destroy_consumer(cp);
			return (NULL);
		}
		/* Adjust the access counts of the new consumer. */
		if ((cp2 != NULL) && (cp2->acr || cp2->acw || cp2->ace)) {
			error = g_access(cp, cp2->acr, cp2->acw, cp2->ace);
			if (error) {
				printf("geom_vinum: couldn't set access counts"
				    " for consumer on %s\n", pp->name);
				g_detach(cp);
				g_destroy_consumer(cp);
				return (NULL);
			}
		}
		s->consumer = cp;

		/* Adjust the size of the providers this plex has. */
		LIST_FOREACH(pp2, &gp->provider, provider)
			pp2->mediasize = p->size;

		/* Update the size of the volume this plex is attached to. */
		if (p->vol_sc != NULL)
			gv_update_vol_size(p->vol_sc, p->size);

		return (NULL);

	/* We need to create a new geom. */
	} else {
		gp = g_new_geomf(mp, "%s", p->name);
		gp->start = gv_plex_start;
		gp->orphan = gv_plex_orphan;
		gp->access = gv_plex_access;
		gp->softc = p;
		p->geom = gp;

		/* Set up the request queue and spawn the worker thread. */
		TAILQ_INIT(&p->packets);
		TAILQ_INIT(&p->bqueue);
		mtx_init(&p->bqueue_mtx, "gv_plex", NULL, MTX_DEF);
		/* NOTE(review): kthread_create return is unchecked. */
		kthread_create(gv_plex_worker, p, NULL, 0, 0, "gv_p %s",
		    p->name);
		p->flags |= GV_PLEX_THREAD_ACTIVE;

		/* Attach a consumer to this provider. */
		/* NOTE(review): unlike the branch above, this g_attach's
		 * return value is not checked — confirm intentional. */
		cp = g_new_consumer(gp);
		g_attach(cp, pp);
		s->consumer = cp;

		/* Create a provider for the outside world. */
		pp2 = g_new_providerf(gp, "gvinum/plex/%s", p->name);
		pp2->mediasize = p->size;
		pp2->sectorsize = pp->sectorsize;
		p->provider = pp2;
		g_error_provider(pp2, 0);
		return (gp);
	}
}
706130389Sle
707130389Slestatic int
708130389Slegv_plex_destroy_geom(struct gctl_req *req, struct g_class *mp,
709130389Sle    struct g_geom *gp)
710130389Sle{
711130389Sle	struct gv_plex *p;
712130389Sle
713130389Sle	g_trace(G_T_TOPOLOGY, "gv_plex_destroy_geom: %s", gp->name);
714130389Sle	g_topology_assert();
715130389Sle
716130389Sle	p = gp->softc;
717130389Sle
718130389Sle	KASSERT(p != NULL, ("gv_plex_destroy_geom: null p of '%s'", gp->name));
719130389Sle
720130389Sle	/*
721130389Sle	 * If this is a RAID5 plex, check if its worker thread is still active
722130389Sle	 * and signal it to self destruct.
723130389Sle	 */
724135164Sle	gv_kill_plex_thread(p);
725130389Sle	/* g_free(sc); */
726130389Sle	g_wither_geom(gp, ENXIO);
727130389Sle	return (0);
728130389Sle}
729130389Sle
#define	VINUMPLEX_CLASS_NAME "VINUMPLEX"

/* GEOM class descriptor for vinum plexes; method hooks defined above. */
static struct g_class g_vinum_plex_class = {
	.name = VINUMPLEX_CLASS_NAME,
	.version = G_VERSION,
	.taste = gv_plex_taste,
	.destroy_geom = gv_plex_destroy_geom,
};

DECLARE_GEOM_CLASS(g_vinum_plex_class, g_vinum_plex);
740