/*-
 * Copyright (c) 2007 Lukas Ertl
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/systm.h>

#include <geom/geom.h>
#include <geom/vinum/geom_vinum_var.h>
#include <geom/vinum/geom_vinum.h>

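/*
 * Restart all write requests that were delayed on this volume's write
 * queue, handing each of them to gv_volume_start() again.
 */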
void
gv_volume_flush(struct gv_volume *v)
{
	struct gv_softc *sc;
	struct bio *bp;

	KASSERT(v != NULL, ("NULL v"));
	sc = v->vinumconf;
	KASSERT(sc != NULL, ("NULL sc"));

	bp = bioq_takefirst(v->wqueue);
	while (bp != NULL) {
		gv_volume_start(sc, bp);
		bp = bioq_takefirst(v->wqueue);
	}
}

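/*
 * Handle a BIO directed at this volume: reads are sent to one suitable
 * plex in a round-robin fashion, while writes and deletes are handed to
 * every plex that is at least degraded.
 */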
void
gv_volume_start(struct gv_softc *sc, struct bio *bp)
{
	struct g_geom *gp;
	struct gv_volume *v;
	struct gv_plex *p, *lp;
	int numwrites;

	gp = sc->geom;
	v = bp->bio_to->private;
	if (v == NULL || v->state != GV_VOL_UP) {
		g_io_deliver(bp, ENXIO);
		return;
	}

	switch (bp->bio_cmd) {
	case BIO_READ:
		/*
		 * Try to find a good plex to send the request to, in a
		 * round-robin fashion.  The plex either has to be up, or
		 * be a degraded RAID5 plex.  If there are delayed (write)
		 * requests pending, put this read on the delayed queue as
		 * well, so that we don't read stale data.
		 */
		if (bioq_first(v->wqueue) != NULL) {
			bioq_insert_tail(v->wqueue, bp);
			break;
		}
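		/* Start the search at the plex after the last one read from. */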
		lp = v->last_read_plex;
		if (lp == NULL)
			lp = LIST_FIRST(&v->plexes);
		p = LIST_NEXT(lp, in_volume);
		if (p == NULL)
			p = LIST_FIRST(&v->plexes);
		do {
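			/*
			 * No candidate at all; fall back to the last-read
			 * plex and let the state check below decide.
			 */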
			if (p == NULL) {
				p = lp;
				break;
			}
			if ((p->state > GV_PLEX_DEGRADED) ||
			    (p->state >= GV_PLEX_DEGRADED &&
			    p->org == GV_PLEX_RAID5))
				break;
			p = LIST_NEXT(p, in_volume);
			if (p == NULL)
				p = LIST_FIRST(&v->plexes);
		} while (p != lp);

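		/*
		 * Fail the request if we didn't find a usable plex: a
		 * non-RAID5 plex must be up, and a RAID5 plex must be at
		 * least degraded.
		 */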
		if ((p == NULL) ||
		    (p->org == GV_PLEX_RAID5 && p->state < GV_PLEX_DEGRADED) ||
		    (p->org != GV_PLEX_RAID5 && p->state <= GV_PLEX_DEGRADED)) {
			g_io_deliver(bp, ENXIO);
			return;
		}
		v->last_read_plex = p;

		/* Hand it down to the plex logic. */
		gv_plex_start(p, bp);
		break;

	case BIO_WRITE:
	case BIO_DELETE:
		/* Delay write requests while any plex is synchronizing. */
		LIST_FOREACH(p, &v->plexes, in_volume) {
			if (p->flags & GV_PLEX_SYNCING) {
				bioq_insert_tail(v->wqueue, bp);
				return;
			}
		}

		numwrites = 0;
		/* Give the BIO to each plex of this volume. */
		LIST_FOREACH(p, &v->plexes, in_volume) {
			if (p->state < GV_PLEX_DEGRADED)
				continue;
			gv_plex_start(p, bp);
			numwrites++;
		}
		if (numwrites == 0)
			g_io_deliver(bp, ENXIO);
		break;
	}
}

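/*
 * Completion handler for BIOs that went down to a subdisk: dispatch the
 * finished request to the completion code of the plex organization it
 * belongs to.
 */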
void
gv_bio_done(struct gv_softc *sc, struct bio *bp)
{
	struct gv_volume *v;
	struct gv_plex *p;
	struct gv_sd *s;

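	/* Walk back from the subdisk that carried the BIO to its plex and volume. */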
	s = bp->bio_caller1;
	KASSERT(s != NULL, ("gv_bio_done: NULL s"));
	p = s->plex_sc;
	KASSERT(p != NULL, ("gv_bio_done: NULL p"));
	v = p->vol_sc;
	KASSERT(v != NULL, ("gv_bio_done: NULL v"));

	switch (p->org) {
	case GV_PLEX_CONCAT:
	case GV_PLEX_STRIPED:
		gv_plex_normal_done(p, bp);
		break;
	case GV_PLEX_RAID5:
		gv_plex_raid5_done(p, bp);
		break;
	}
}