geom_vinum_plex.c revision 133318
/*-
 * Copyright (c) 2004 Lukas Ertl
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/geom/vinum/geom_vinum_plex.c 133318 2004-08-08 07:57:53Z phk $");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/systm.h>

#include <geom/geom.h>
#include <geom/vinum/geom_vinum_var.h>
#include <geom/vinum/geom_vinum_raid5.h>
#include <geom/vinum/geom_vinum.h>

/* XXX: is this the place to catch dying subdisks? */
static void
gv_plex_orphan(struct g_consumer *cp)
{
	struct g_geom *gp;
	struct gv_plex *p;
	int error;

	g_topology_assert();
	gp = cp->geom;
	g_trace(G_T_TOPOLOGY, "gv_plex_orphan(%s)", gp->name);

	if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0)
		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
	error = cp->provider->error;
	if (error == 0)
		error = ENXIO;
	g_detach(cp);
	g_destroy_consumer(cp);
	if (!LIST_EMPTY(&gp->consumer))
		return;

	p = gp->softc;
	if (p != NULL) {
		gv_kill_thread(p);
		p->geom = NULL;
		p->provider = NULL;
		p->consumer = NULL;
	}
	gp->softc = NULL;
	g_wither_geom(gp, error);
}

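/*
 * Completion handler for requests that gv_plexbuffer() tagged with a
 * subdisk in bio_caller1, i.e. writes used to initialize a stale subdisk.
 * It accounts how much of the subdisk has been written so far and flips
 * the subdisk to 'up' once it has been fully initialized.
 */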
static void
gv_plex_done(struct bio *bp)
{
	struct g_geom *gp;
	struct gv_sd *s;

	gp = bp->bio_to->geom;

	s = bp->bio_caller1;
	KASSERT(s != NULL, ("gv_plex_done: NULL s"));

	if (bp->bio_error == 0)
		s->initialized += bp->bio_length;

	if (s->initialized >= s->size) {
		gv_set_sd_state(s, GV_SD_UP, 0);
		s->initialized = 0;
	}

	g_std_done(bp);
}

/* Find the correct subdisk for this request and build a bio to send to it. */
static int
gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp,
    caddr_t addr, long bcount, off_t boff)
{
	struct g_geom *gp;
	struct gv_plex *p;
	struct gv_sd *s;
	struct bio *cbp;
	int i, sdno;
	off_t len_left, real_len, real_off, stripeend, stripeno, stripestart;

	s = NULL;

	gp = bp->bio_to->geom;
	p = gp->softc;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/*
	 * We only handle concatenated and striped plexes here.  RAID5 plexes
	 * are handled in gv_build_raid5_req().
	 */
	switch (p->org) {
	case GV_PLEX_CONCAT:
		/*
		 * Find the subdisk where this request starts.  The subdisks in
		 * this list must be ordered by plex_offset.
		 */
		LIST_FOREACH(s, &p->subdisks, in_plex) {
			if (s->plex_offset <= boff &&
			    s->plex_offset + s->size > boff)
				break;
		}
		/* Subdisk not found. */
		if (s == NULL)
			return (ENXIO);

		/*
		 * Calculate the corresponding offset on the subdisk and clamp
		 * the length to what is left of this subdisk; the loop in
		 * gv_plex_start() hands the remainder to the next subdisk.
		 */
		real_off = boff - s->plex_offset;
		len_left = s->size - real_off;
		real_len = (bcount > len_left) ? len_left : bcount;
		break;

	case GV_PLEX_STRIPED:
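		/*
		 * Illustration of the arithmetic below, with hypothetical
		 * numbers: with a stripe size of 64k and three subdisks, a
		 * request at plex offset 196k lies in stripe number 3
		 * (196k / 64k), which resides on subdisk 0 (3 % 3); that
		 * stripe starts at subdisk offset 64k ((3 / 3) * 64k), so
		 * the request starts at subdisk offset 68k (4k into the
		 * stripe), with 60k left in the stripe.
		 */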
		/* The number of the stripe where the request starts. */
		stripeno = boff / p->stripesize;

		/* The number of the subdisk where the stripe resides. */
		sdno = stripeno % p->sdcount;

		/* Find the right subdisk. */
		i = 0;
		LIST_FOREACH(s, &p->subdisks, in_plex) {
			if (i == sdno)
				break;
			i++;
		}

		/* Subdisk not found. */
		if (s == NULL)
			return (ENXIO);

		/* The offset of the stripe from the start of the subdisk. */
		stripestart = (stripeno / p->sdcount) *
		    p->stripesize;

		/* The offset at the end of the stripe. */
		stripeend = stripestart + p->stripesize;

		/* The offset of the request on this subdisk. */
		real_off = boff - (stripeno * p->stripesize) +
		    stripestart;

		/* The length left in this stripe. */
		len_left = stripeend - real_off;

		real_len = (bcount <= len_left) ? bcount : len_left;
		break;

	default:
		return (EINVAL);
	}

	/* Now check if we can handle the request on this subdisk. */
	switch (s->state) {
	case GV_SD_UP:
		/* If the subdisk is up, just continue. */
		break;

	case GV_SD_STALE:
		if (bp->bio_caller1 != p)
			return (ENXIO);

		printf("geom_vinum: setting subdisk %s to initializing\n",
		    s->name);
		gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE);
		break;

	case GV_SD_INITIALIZING:
		if (bp->bio_cmd == BIO_READ)
			return (ENXIO);
		break;

	default:
		/* All other subdisk states mean it's not accessible. */
		return (ENXIO);
	}

	/* Clone the bio and adjust the offsets and sizes. */
	cbp = g_clone_bio(bp);
	if (cbp == NULL)
		return (ENOMEM);
	cbp->bio_offset = real_off;
	cbp->bio_length = real_len;
	cbp->bio_data = addr;
	if (bp->bio_caller1 == p) {
		cbp->bio_caller1 = s;
		cbp->bio_done = gv_plex_done;
	} else
		cbp->bio_done = g_std_done;
	*bp2 = cbp;
	*cp = s->consumer;
	return (0);
}

static void
gv_plex_start(struct bio *bp)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	struct gv_plex *p;
	struct gv_raid5_packet *wp;
	struct bio *bp2;
	caddr_t addr;
	off_t boff;
	long bcount, rcount;
	int err;

	gp = bp->bio_to->geom;
	p = gp->softc;

	/*
	 * We cannot handle this request if too many of our subdisks are
	 * inaccessible.
	 */
	if ((p->state < GV_PLEX_DEGRADED) && (bp->bio_caller1 != p)) {
		g_io_deliver(bp, ENXIO);  /* XXX: correct way? */
		return;
	}

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		/*
		 * We split the request up into smaller packets and hand them
		 * down to our subdisks.
		 */
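		/*
		 * Each iteration hands down the largest chunk that the
		 * target subdisk (or stripe) can take; rcount is how much
		 * was sent, and the offset and data pointer advance by that
		 * amount until the whole request has been pushed down.
		 */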
		wp = NULL;
		addr = bp->bio_data;
		boff = bp->bio_offset;
		for (bcount = bp->bio_length; bcount > 0; bcount -= rcount) {
			/*
			 * RAID5 requests usually need to be split up into
			 * several subrequests.
			 */
			if (p->org == GV_PLEX_RAID5) {
				wp = gv_new_raid5_packet();
				wp->bio = bp;
				err = gv_build_raid5_req(wp, bp, addr, bcount,
				    boff);
			} else
				err = gv_plexbuffer(bp, &bp2, &cp, addr, bcount,
				    boff);

			if (err) {
				if (p->org == GV_PLEX_RAID5)
					gv_free_raid5_packet(wp);
				bp->bio_completed += bcount;
				if (bp->bio_error == 0)
					bp->bio_error = err;
				if (bp->bio_completed == bp->bio_length)
					g_io_deliver(bp, bp->bio_error);
				return;
			}

			if (p->org != GV_PLEX_RAID5) {
				rcount = bp2->bio_length;
				g_io_request(bp2, cp);

			/*
			 * RAID5 subrequests are queued on a worklist
			 * and picked up by the worker thread.  This
			 * ensures correct ordering.
			 */
			} else {
				mtx_lock(&p->worklist_mtx);
				TAILQ_INSERT_TAIL(&p->worklist, wp,
				    list);
				mtx_unlock(&p->worklist_mtx);
				wakeup(p);
				rcount = wp->length;
			}

			boff += rcount;
			addr += rcount;
		}
		return;

	default:
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}
}

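/*
 * Pass an access request down to all attached consumers; if one of them
 * fails, revoke what was already granted from the consumers before it.
 */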
static int
gv_plex_access(struct g_provider *pp, int dr, int dw, int de)
{
	struct g_geom *gp;
	struct g_consumer *cp, *cp2;
	int error;

	gp = pp->geom;

	error = ENXIO;
	LIST_FOREACH(cp, &gp->consumer, consumer) {
		error = g_access(cp, dr, dw, de);
		if (error) {
			LIST_FOREACH(cp2, &gp->consumer, consumer) {
				if (cp == cp2)
					break;
				g_access(cp2, -dr, -dw, -de);
			}
			return (error);
		}
	}
	return (error);
}

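/*
 * Taste callback, invoked for every provider GEOM offers us.  We only care
 * about providers of VINUMDRIVE geoms, i.e. subdisks; a tasted subdisk is
 * wired into its plex, and a new plex geom and provider are created when
 * the first subdisk of a plex shows up.
 */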
static struct g_geom *
gv_plex_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_geom *gp;
	struct g_consumer *cp, *cp2;
	struct g_provider *pp2;
	struct gv_plex *p;
	struct gv_sd *s;
	struct gv_softc *sc;
	int error;

	g_trace(G_T_TOPOLOGY, "gv_plex_taste(%s, %s)", mp->name, pp->name);
	g_topology_assert();

	/* We only want to attach to subdisks. */
	if (strcmp(pp->geom->class->name, "VINUMDRIVE"))
		return (NULL);

	/* Find the VINUM class and its associated geom. */
	gp = find_vinum_geom();
	if (gp == NULL)
		return (NULL);
	sc = gp->softc;
	KASSERT(sc != NULL, ("gv_plex_taste: NULL sc"));

	/* Find out which subdisk the offered provider corresponds to. */
	s = pp->private;
	KASSERT(s != NULL, ("gv_plex_taste: NULL s"));

	/* Now find the plex this subdisk belongs to. */
	p = gv_find_plex(sc, s->plex);
	KASSERT(p != NULL, ("gv_plex_taste: NULL p"));

	/*
	 * Add this subdisk to this plex.  Since we trust the on-disk
	 * configuration, we don't check the given value (should we?).
	 * XXX: shouldn't be done here
	 */
	gv_sd_to_plex(p, s, 0);

	/* Now check if there's already a geom for this plex. */
	gp = p->geom;

	/* Yes, there is already a geom, so we just add the consumer. */
	if (gp != NULL) {
		cp2 = LIST_FIRST(&gp->consumer);
		/* Need to attach a new consumer to this subdisk. */
		cp = g_new_consumer(gp);
		error = g_attach(cp, pp);
		if (error) {
			printf("geom_vinum: couldn't attach consumer to %s\n",
			    pp->name);
			g_destroy_consumer(cp);
			return (NULL);
		}
		/* Adjust the access counts of the new consumer. */
		if ((cp2 != NULL) && (cp2->acr || cp2->acw || cp2->ace)) {
			error = g_access(cp, cp2->acr, cp2->acw, cp2->ace);
			if (error) {
				printf("geom_vinum: couldn't set access counts"
				    " for consumer on %s\n", pp->name);
				g_detach(cp);
				g_destroy_consumer(cp);
				return (NULL);
			}
		}
		s->consumer = cp;

		/* Adjust the size of the providers this plex has. */
		LIST_FOREACH(pp2, &gp->provider, provider)
			pp2->mediasize = p->size;

		/* Update the size of the volume this plex is attached to. */
		if (p->vol_sc != NULL)
			gv_update_vol_size(p->vol_sc, p->size);

		return (NULL);

	/* We need to create a new geom. */
	} else {
		gp = g_new_geomf(mp, "%s", p->name);
		gp->start = gv_plex_start;
		gp->orphan = gv_plex_orphan;
		gp->access = gv_plex_access;
		gp->softc = p;
		p->geom = gp;

		/* RAID5 plexes need a worker thread that handles their I/O. */
		if (p->org == GV_PLEX_RAID5) {
			TAILQ_INIT(&p->worklist);
			mtx_init(&p->worklist_mtx, "gvinum_worklist", NULL,
			    MTX_DEF);
			p->flags &= ~GV_PLEX_THREAD_DIE;
			kthread_create(gv_raid5_worker, gp, NULL, 0, 0,
			    "gv_raid5");
			p->flags |= GV_PLEX_THREAD_ACTIVE;
		}

		/* Attach a consumer to this provider. */
		cp = g_new_consumer(gp);
		g_attach(cp, pp);
		s->consumer = cp;

		/* Create a provider for the outside world. */
		pp2 = g_new_providerf(gp, "gvinum/plex/%s", p->name);
		pp2->mediasize = p->size;
		pp2->sectorsize = pp->sectorsize;
		p->provider = pp2;
		g_error_provider(pp2, 0);
		return (gp);
	}
}

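/*
 * Called when the kernel wants to get rid of this geom; make sure a RAID5
 * worker thread, if there is one, shuts down first, then wither the geom.
 */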
static int
gv_plex_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp)
{
	struct gv_plex *p;

	g_trace(G_T_TOPOLOGY, "gv_plex_destroy_geom: %s", gp->name);
	g_topology_assert();

	p = gp->softc;

	KASSERT(p != NULL, ("gv_plex_destroy_geom: null p of '%s'", gp->name));

	/*
	 * If this is a RAID5 plex, check if its worker thread is still active
	 * and signal it to self-destruct, then tear down its worklist mutex,
	 * which is only initialized for RAID5 plexes.
	 */
	gv_kill_thread(p);
	if (p->org == GV_PLEX_RAID5)
		mtx_destroy(&p->worklist_mtx);
	/* g_free(sc); */
	g_wither_geom(gp, ENXIO);
	return (0);
}

#define	VINUMPLEX_CLASS_NAME "VINUMPLEX"

static struct g_class g_vinum_plex_class = {
	.name = VINUMPLEX_CLASS_NAME,
	.version = G_VERSION,
	.taste = gv_plex_taste,
	.destroy_geom = gv_plex_destroy_geom,
};

DECLARE_GEOM_CLASS(g_vinum_plex_class, g_vinum_plex);