geom_vinum_plex.c revision 130389
/*-
 * Copyright (c) 2004 Lukas Ertl
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/geom/vinum/geom_vinum_plex.c 130389 2004-06-12 21:16:10Z le $");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/systm.h>

#include <geom/geom.h>
#include <geom/vinum/geom_vinum_var.h>
#include <geom/vinum/geom_vinum_raid5.h>
#include <geom/vinum/geom_vinum.h>

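/*
 * This file implements the VINUMPLEX GEOM class.  A plex geom sits on top
 * of one or more VINUMDRIVE subdisk providers and maps I/O according to
 * the plex organization: concatenated and striped requests are translated
 * in gv_plexbuffer() below, while RAID5 requests are split into
 * subrequests that a dedicated worker thread processes in order.
 */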
/* XXX: is this the place to catch dying subdisks? */
static void
gv_plex_orphan(struct g_consumer *cp)
{
	struct g_geom *gp;
	struct gv_plex *p;
	int error;

	g_topology_assert();
	gp = cp->geom;
	g_trace(G_T_TOPOLOGY, "gv_plex_orphan(%s)", gp->name);

	if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0)
		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
	error = cp->provider->error;
	if (error == 0)
		error = ENXIO;
	g_detach(cp);
	g_destroy_consumer(cp);
	if (!LIST_EMPTY(&gp->consumer))
		return;

	p = gp->softc;
	gv_kill_thread(p);
	g_free(p);
	g_wither_geom(gp, error);
}

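/*
 * Completion handler for requests that the plex issues to a subdisk it is
 * currently initializing (bio_caller1 carries the subdisk).  It accounts
 * for the bytes written and marks the subdisk up once it has been written
 * completely.
 */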
static void
gv_plex_done(struct bio *bp)
{
	struct gv_sd *s;

	s = bp->bio_caller1;
	KASSERT(s != NULL, ("gv_plex_done: NULL s"));

	if (bp->bio_error == 0)
		s->initialized += bp->bio_length;

	if (s->initialized >= s->size) {
		gv_set_sd_state(s, GV_SD_UP, 0);
		s->initialized = 0;
	}

	g_std_done(bp);
}

/* Find the correct subdisk to send the bio to and build a bio to send. */
static int
gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp,
    caddr_t addr, long bcount, off_t boff)
{
	struct g_geom *gp;
	struct gv_plex *p;
	struct gv_sd *s;
	struct bio *cbp;
	int i, sdno;
	off_t len_left, real_len, real_off, stripeend, stripeno, stripestart;

	s = NULL;

	gp = bp->bio_to->geom;
	p = gp->softc;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/*
	 * We only handle concatenated and striped plexes here.  RAID5 plexes
	 * are handled in gv_build_raid5_req().
	 */
	switch (p->org) {
	case GV_PLEX_CONCAT:
		/*
		 * Find the subdisk where this request starts.  The subdisks in
		 * this list must be ordered by plex_offset.
		 */
		LIST_FOREACH(s, &p->subdisks, in_plex) {
			if (s->plex_offset <= boff &&
			    s->plex_offset + s->size > boff)
				break;
		}
		/* Subdisk not found. */
		if (s == NULL)
			return (ENXIO);

		/* Calculate corresponding offsets on disk. */
		real_off = boff - s->plex_offset;
		len_left = s->size - real_off;
		real_len = (bcount > len_left) ? len_left : bcount;
		break;

	case GV_PLEX_STRIPED:
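		/*
		 * Example (illustrative numbers): with a stripe size of 64k
		 * and three subdisks, a request at plex offset 200k falls
		 * into stripe 3 (200k / 64k), which lives on subdisk 0
		 * (3 % 3) and starts 64k into that subdisk ((3 / 3) * 64k);
		 * the request thus maps to subdisk offset 72k, and at most
		 * 56k of it can be carried in this stripe.
		 */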
		/* The number of the stripe where the request starts. */
		stripeno = boff / p->stripesize;

		/* The number of the subdisk where the stripe resides. */
		sdno = stripeno % p->sdcount;

		/* Find the right subdisk. */
		i = 0;
		LIST_FOREACH(s, &p->subdisks, in_plex) {
			if (i == sdno)
				break;
			i++;
		}

		/* Subdisk not found. */
		if (s == NULL)
			return (ENXIO);

		/* The offset of the stripe from the start of the subdisk. */
		stripestart = (stripeno / p->sdcount) * p->stripesize;

		/* The offset at the end of the stripe. */
		stripeend = stripestart + p->stripesize;

		/* The offset of the request on this subdisk. */
		real_off = boff - (stripeno * p->stripesize) + stripestart;

		/* The length left in this stripe. */
		len_left = stripeend - real_off;

		real_len = (bcount <= len_left) ? bcount : len_left;
		break;

	default:
		return (EINVAL);
	}

	/* Now check if we can handle the request on this subdisk. */
	switch (s->state) {
	case GV_SD_UP:
		/* If the subdisk is up, just continue. */
		break;

	case GV_SD_STALE:
		if (bp->bio_caller1 != p)
			return (ENXIO);

		printf("GEOM_VINUM: setting sd %s to GV_SD_INITIALIZING\n",
		    s->name);
		gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE);
		break;

	case GV_SD_INITIALIZING:
		if (bp->bio_cmd == BIO_READ)
			return (ENXIO);
		break;

	default:
		/* All other subdisk states mean it's not accessible. */
		return (ENXIO);
	}

	/* Clone the bio and adjust the offsets and sizes. */
	cbp = g_clone_bio(bp);
	if (cbp == NULL)
		return (ENOMEM);
	cbp->bio_offset = real_off;
	cbp->bio_length = real_len;
	cbp->bio_data = addr;
	if (bp->bio_caller1 == p) {
		cbp->bio_caller1 = s;
		cbp->bio_done = gv_plex_done;
	} else
		cbp->bio_done = g_std_done;
	*bp2 = cbp;
	*cp = s->consumer;
	return (0);
}

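/*
 * Handle I/O arriving at a plex provider by breaking it up into chunks
 * that each fit a single subdisk (or, for RAID5, a single stripe) and
 * handing those down.
 */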
static void
gv_plex_start(struct bio *bp)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	struct gv_plex *p;
	struct gv_raid5_packet *wp;
	struct bio *bp2;
	caddr_t addr;
	off_t boff;
	long bcount, rcount;
	int err;

	gp = bp->bio_to->geom;
	p = gp->softc;

	/*
	 * We cannot handle this request if too many of our subdisks are
	 * inaccessible.
	 */
	if ((p->state < GV_PLEX_DEGRADED) && (bp->bio_caller1 != p)) {
		g_io_deliver(bp, ENXIO);  /* XXX: correct way? */
		return;
	}

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		/*
		 * We split up the request into smaller packets and hand them
		 * down to our subdisks.
		 */
		wp = NULL;
		addr = bp->bio_data;
		boff = bp->bio_offset;
		for (bcount = bp->bio_length; bcount > 0; bcount -= rcount) {
			/*
			 * RAID5 requests usually need to be split up into
			 * several subrequests.
			 */
			if (p->org == GV_PLEX_RAID5) {
				wp = gv_new_raid5_packet();
				wp->bio = bp;
				err = gv_build_raid5_req(wp, bp, addr, bcount,
				    boff);
			} else
				err = gv_plexbuffer(bp, &bp2, &cp, addr, bcount,
				    boff);

			if (err) {
				bp->bio_completed += bcount;
				if (bp->bio_error == 0)
					bp->bio_error = err;
				if (bp->bio_completed == bp->bio_length)
					g_io_deliver(bp, bp->bio_error);
				return;
			}

			if (p->org != GV_PLEX_RAID5) {
				rcount = bp2->bio_length;
				g_io_request(bp2, cp);

			/*
			 * RAID5 subrequests are queued on a worklist
			 * and picked up by the worker thread.  This
			 * ensures correct order.
			 */
			} else {
				mtx_lock(&p->worklist_mtx);
				TAILQ_INSERT_TAIL(&p->worklist, wp, list);
				mtx_unlock(&p->worklist_mtx);
				wakeup(p);
				rcount = wp->length;
			}

			boff += rcount;
			addr += rcount;
		}
		return;

	default:
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}
}

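/*
 * Pass an access request down to all attached consumers.  If one of them
 * fails, roll back the counts that were already granted and return the
 * error.
 */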
static int
gv_plex_access(struct g_provider *pp, int dr, int dw, int de)
{
	struct g_geom *gp;
	struct g_consumer *cp, *cp2;
	int error;

	gp = pp->geom;

	error = ENXIO;
	LIST_FOREACH(cp, &gp->consumer, consumer) {
		error = g_access(cp, dr, dw, de);
		if (error) {
			LIST_FOREACH(cp2, &gp->consumer, consumer) {
				if (cp == cp2)
					break;
				g_access(cp2, -dr, -dw, -de);
			}
			return (error);
		}
	}
	return (error);
}

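/*
 * Taste a provider offered by the VINUMDRIVE class, attach the subdisk it
 * represents to its plex, and create the plex geom and provider if they
 * do not exist yet.
 */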
static struct g_geom *
gv_plex_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	struct g_provider *pp2;
	struct gv_plex *p;
	struct gv_sd *s;
	struct gv_softc *sc;

	g_trace(G_T_TOPOLOGY, "gv_plex_taste(%s, %s)", mp->name, pp->name);
	g_topology_assert();

	/* We only want to attach to subdisks. */
	if (strcmp(pp->geom->class->name, "VINUMDRIVE"))
		return (NULL);

	/* Find the VINUM class and its associated geom. */
	gp = find_vinum_geom();
	if (gp == NULL)
		return (NULL);
	sc = gp->softc;
	KASSERT(sc != NULL, ("gv_plex_taste: NULL sc"));

	/* Find out which subdisk the offered provider corresponds to. */
	s = pp->private;
	KASSERT(s != NULL, ("gv_plex_taste: NULL s"));

	/* Now find the correct plex this subdisk belongs to. */
	p = gv_find_plex(sc, s->plex);
	KASSERT(p != NULL, ("gv_plex_taste: NULL p"));

	/*
	 * Add this subdisk to this plex.  Since we trust the on-disk
	 * configuration, we don't check the given value (should we?).
	 * XXX: shouldn't be done here
	 */
	gv_sd_to_plex(p, s, 0);

	/* Now check if there's already a geom for this plex. */
	gp = p->geom;

	/* Yes, there is already a geom, so we just add the consumer. */
	if (gp != NULL) {
		/* Need to attach a new consumer to this subdisk. */
		cp = g_new_consumer(gp);
		g_attach(cp, pp);
		s->consumer = cp;

		/* Adjust the size of the providers this plex has. */
		LIST_FOREACH(pp2, &gp->provider, provider)
			pp2->mediasize = p->size;

		return (NULL);

	/* We need to create a new geom. */
	} else {
		gp = g_new_geomf(mp, "%s", p->name);
		gp->start = gv_plex_start;
		gp->orphan = gv_plex_orphan;
		gp->access = gv_plex_access;
		gp->softc = p;
		p->geom = gp;

		/* RAID5 plexes need a 'worker' thread in which I/O is
		 * handled. */
		if (p->org == GV_PLEX_RAID5) {
			TAILQ_INIT(&p->worklist);
			mtx_init(&p->worklist_mtx, "gvinum_worklist", NULL,
			    MTX_DEF);
			p->flags &= ~GV_PLEX_THREAD_DIE;
			kthread_create(gv_raid5_worker, gp, NULL, 0, 0,
			    "gv_raid5");
			p->flags |= GV_PLEX_THREAD_ACTIVE;
		}

		/* Attach a consumer to this provider. */
		cp = g_new_consumer(gp);
		g_attach(cp, pp);
		s->consumer = cp;

		/* Create a provider for the outside world. */
		pp2 = g_new_providerf(gp, "gvinum/plex/%s", p->name);
		pp2->mediasize = p->size;
		pp2->sectorsize = pp->sectorsize;
		p->provider = pp2;
		g_error_provider(pp2, 0);
		return (gp);
	}
}

static int
gv_plex_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp)
{
	struct gv_plex *p;

	g_trace(G_T_TOPOLOGY, "gv_plex_destroy_geom: %s", gp->name);
	g_topology_assert();

	p = gp->softc;

	KASSERT(p != NULL, ("gv_plex_destroy_geom: null p of '%s'", gp->name));

	/*
	 * If this is a RAID5 plex, check if its worker thread is still active
	 * and signal it to self destruct.  The worklist mutex only exists
	 * for RAID5 plexes, so only destroy it for those.
	 */
	gv_kill_thread(p);
	if (p->org == GV_PLEX_RAID5)
		mtx_destroy(&p->worklist_mtx);
	g_wither_geom(gp, ENXIO);
	return (0);
}

#define	VINUMPLEX_CLASS_NAME "VINUMPLEX"

static struct g_class g_vinum_plex_class = {
	.name = VINUMPLEX_CLASS_NAME,
	.taste = gv_plex_taste,
	.destroy_geom = gv_plex_destroy_geom,
};

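/* Register the class with GEOM so that providers are offered for tasting. */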
DECLARE_GEOM_CLASS(g_vinum_plex_class, g_vinum_plex);