/*-
 * Copyright (c) 2004, 2005 Lukas Ertl
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/geom/vinum/geom_vinum_drive.c 184292 2008-10-26 17:20:37Z lulf $");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/errno.h>
#include <sys/endian.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/sbuf.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/vimage.h>

#include <geom/geom.h>
#include <geom/vinum/geom_vinum_var.h>
#include <geom/vinum/geom_vinum.h>
#include <geom/vinum/geom_vinum_share.h>

#define GV_LEGACY_I386	0
#define GV_LEGACY_AMD64 1
#define GV_LEGACY_SPARC64 2
#define GV_LEGACY_POWERPC 3

static void	gv_drive_dead(void *, int);
static void	gv_drive_worker(void *);
static int	gv_legacy_header_type(uint8_t *, int);

/*
 * Here are the "offset (size)" for the various struct gv_hdr fields,
 * for the legacy i386 (or 32-bit powerpc), legacy amd64 (or sparc64), and
 * current (cpu & endian agnostic) versions of the on-disk format of the vinum
 * header structure:
 *
 *       i386    amd64   current   field
 *     -------- -------- --------  -----
 *       0 ( 8)   0 ( 8)   0 ( 8)  magic
 *       8 ( 4)   8 ( 8)   8 ( 8)  config_length
 *      12 (32)  16 (32)  16 (32)  label.sysname
 *      44 (32)  48 (32)  48 (32)  label.name
 *      76 ( 4)  80 ( 8)  80 ( 8)  label.date_of_birth.tv_sec
 *      80 ( 4)  88 ( 8)  88 ( 8)  label.date_of_birth.tv_usec
 *      84 ( 4)  96 ( 8)  96 ( 8)  label.last_update.tv_sec
 *      88 ( 4) 104 ( 8) 104 ( 8)  label.last_update.tv_usec
 *      92 ( 8) 112 ( 8) 112 ( 8)  label.drive_size
 *     ======== ======== ========
 *     100      120      120       total size
 *
 * NOTE: i386 and amd64 formats are stored as little-endian; the current
 * format uses big-endian (network order).
 */

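/*
 * Purely illustrative sketch (not compiled into the module): the "current",
 * endian-agnostic column of the table above could be described by a packed
 * structure such as the one below.  The structure and field names are
 * hypothetical; the driver never declares such a type and instead packs and
 * unpacks the fields by hand in gv_read_header() and gv_write_header() so
 * that the legacy layouts can be handled as well.
 */
#if 0
struct gv_hdr_ondisk {
	uint64_t magic;			/* offset   0, big-endian */
	uint64_t config_length;		/* offset   8, big-endian */
	char	 sysname[32];		/* offset  16, GV_HOSTNAME_LEN bytes */
	char	 name[32];		/* offset  48, GV_MAXDRIVENAME bytes */
	uint64_t date_of_birth_sec;	/* offset  80, big-endian */
	uint64_t date_of_birth_usec;	/* offset  88, big-endian */
	uint64_t last_update_sec;	/* offset  96, big-endian */
	uint64_t last_update_usec;	/* offset 104, big-endian */
	uint64_t drive_size;		/* offset 112, big-endian */
} __packed;				/* 120 bytes total */
#endif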
/* Checks for legacy format depending on platform. */
static int
gv_legacy_header_type(uint8_t *hdr, int bigendian)
{
	uint32_t *i32;
	int arch_32, arch_64, i;

	/* Set arch according to endianness. */
	if (bigendian) {
		arch_32 = GV_LEGACY_POWERPC;
		arch_64 = GV_LEGACY_SPARC64;
	} else {
		arch_32 = GV_LEGACY_I386;
		arch_64 = GV_LEGACY_AMD64;
	}

	/* if non-empty hostname overlaps 64-bit config_length */
	i32 = (uint32_t *)(hdr + 12);
	if (*i32 != 0)
		return (arch_32);
	/* check for non-empty hostname */
	if (hdr[16] != 0)
		return (arch_64);
	/* check bytes past 32-bit structure */
	for (i = 100; i < 120; i++)
		if (hdr[i] != 0)
			return (arch_32);
	/* check for overlapping timestamp */
	i32 = (uint32_t *)(hdr + 84);
	if (*i32 == 0)
		return (arch_64);
	return (arch_32);
}

/*
 * Read the header while taking magic number into account, and write it to
 * destination pointer.
 */
int
gv_read_header(struct g_consumer *cp, struct gv_hdr *m_hdr)
{
	struct g_provider *pp;
	uint64_t magic_machdep;
	uint8_t *d_hdr;
	int be, off;

#define GV_GET32(endian)					\
		endian##32toh(*((uint32_t *)&d_hdr[off]));	\
		off += 4
#define GV_GET64(endian)					\
		endian##64toh(*((uint64_t *)&d_hdr[off]));	\
		off += 8
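/*
 * Each GV_GET macro expands to two statements: the byte-order conversion of
 * the value at d_hdr[off], followed by an advance of 'off'.  It is therefore
 * only safe as the complete right-hand side of a plain assignment such as
 * "m_hdr->magic = GV_GET64(be);".
 */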

	KASSERT(m_hdr != NULL, ("gv_read_header: null m_hdr"));
	KASSERT(cp != NULL, ("gv_read_header: null cp"));
	pp = cp->provider;
	KASSERT(pp != NULL, ("gv_read_header: null pp"));

	d_hdr = g_read_data(cp, GV_HDR_OFFSET, pp->sectorsize, NULL);
	if (d_hdr == NULL)
		return (-1);
	off = 0;
	m_hdr->magic = GV_GET64(be);
	magic_machdep = *((uint64_t *)&d_hdr[0]);
	/*
	 * The big endian machines will have a reverse of GV_OLD_MAGIC, so we
	 * need to decide if we are running on a big endian machine as well as
	 * checking the magic against the reverse of GV_OLD_MAGIC.
	 */
	be = (m_hdr->magic == magic_machdep);
	if (m_hdr->magic == GV_MAGIC) {
		m_hdr->config_length = GV_GET64(be);
		off = 16;
		bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN);
		off += GV_HOSTNAME_LEN;
		bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME);
		off += GV_MAXDRIVENAME;
		m_hdr->label.date_of_birth.tv_sec = GV_GET64(be);
		m_hdr->label.date_of_birth.tv_usec = GV_GET64(be);
		m_hdr->label.last_update.tv_sec = GV_GET64(be);
		m_hdr->label.last_update.tv_usec = GV_GET64(be);
		m_hdr->label.drive_size = GV_GET64(be);
	} else if (m_hdr->magic != GV_OLD_MAGIC &&
	    m_hdr->magic != le64toh(GV_OLD_MAGIC)) {
		/* Not a gvinum drive. */
		g_free(d_hdr);
		return (-1);
	} else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_SPARC64) {
		G_VINUM_DEBUG(1, "detected legacy sparc64 header");
		m_hdr->magic = GV_MAGIC;
		/* Legacy sparc64 on-disk header */
		m_hdr->config_length = GV_GET64(be);
		bcopy(d_hdr + 16, m_hdr->label.sysname, GV_HOSTNAME_LEN);
		off += GV_HOSTNAME_LEN;
		bcopy(d_hdr + 48, m_hdr->label.name, GV_MAXDRIVENAME);
		off += GV_MAXDRIVENAME;
		m_hdr->label.date_of_birth.tv_sec = GV_GET64(be);
		m_hdr->label.date_of_birth.tv_usec = GV_GET64(be);
		m_hdr->label.last_update.tv_sec = GV_GET64(be);
		m_hdr->label.last_update.tv_usec = GV_GET64(be);
		m_hdr->label.drive_size = GV_GET64(be);
	} else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_POWERPC) {
		G_VINUM_DEBUG(1, "detected legacy PowerPC header");
		m_hdr->magic = GV_MAGIC;
		/* legacy 32-bit big endian on-disk header */
		m_hdr->config_length = GV_GET32(be);
		bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN);
		off += GV_HOSTNAME_LEN;
		bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME);
		off += GV_MAXDRIVENAME;
		m_hdr->label.date_of_birth.tv_sec = GV_GET32(be);
		m_hdr->label.date_of_birth.tv_usec = GV_GET32(be);
		m_hdr->label.last_update.tv_sec = GV_GET32(be);
		m_hdr->label.last_update.tv_usec = GV_GET32(be);
		m_hdr->label.drive_size = GV_GET64(be);
	} else if (gv_legacy_header_type(d_hdr, be) == GV_LEGACY_I386) {
		G_VINUM_DEBUG(1, "detected legacy i386 header");
		m_hdr->magic = GV_MAGIC;
		/* legacy i386 on-disk header */
		m_hdr->config_length = GV_GET32(le);
		bcopy(d_hdr + off, m_hdr->label.sysname, GV_HOSTNAME_LEN);
		off += GV_HOSTNAME_LEN;
		bcopy(d_hdr + off, m_hdr->label.name, GV_MAXDRIVENAME);
		off += GV_MAXDRIVENAME;
		m_hdr->label.date_of_birth.tv_sec = GV_GET32(le);
		m_hdr->label.date_of_birth.tv_usec = GV_GET32(le);
		m_hdr->label.last_update.tv_sec = GV_GET32(le);
		m_hdr->label.last_update.tv_usec = GV_GET32(le);
		m_hdr->label.drive_size = GV_GET64(le);
	} else {
		G_VINUM_DEBUG(1, "detected legacy amd64 header");
		m_hdr->magic = GV_MAGIC;
		/* legacy amd64 on-disk header */
		m_hdr->config_length = GV_GET64(le);
		bcopy(d_hdr + 16, m_hdr->label.sysname, GV_HOSTNAME_LEN);
		off += GV_HOSTNAME_LEN;
		bcopy(d_hdr + 48, m_hdr->label.name, GV_MAXDRIVENAME);
		off += GV_MAXDRIVENAME;
		m_hdr->label.date_of_birth.tv_sec = GV_GET64(le);
		m_hdr->label.date_of_birth.tv_usec = GV_GET64(le);
		m_hdr->label.last_update.tv_sec = GV_GET64(le);
		m_hdr->label.last_update.tv_usec = GV_GET64(le);
		m_hdr->label.drive_size = GV_GET64(le);
	}

	g_free(d_hdr);
	return (0);
}

/* Write out the gvinum header. */
int
gv_write_header(struct g_consumer *cp, struct gv_hdr *m_hdr)
{
	uint8_t d_hdr[GV_HDR_LEN];
	int off, ret;

#define GV_SET64BE(field)					\
	do {							\
		*((uint64_t *)&d_hdr[off]) = htobe64(field);	\
		off += 8;					\
	} while (0)
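/*
 * GV_SET64BE stores 'field' at d_hdr[off] in big-endian (network) byte order
 * and advances 'off' by 8; the do/while (0) wrapper makes it behave as a
 * single statement.
 */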

	KASSERT(m_hdr != NULL, ("gv_write_header: null m_hdr"));

	off = 0;
	memset(d_hdr, 0, GV_HDR_LEN);
	GV_SET64BE(m_hdr->magic);
	GV_SET64BE(m_hdr->config_length);
	off = 16;
	bcopy(m_hdr->label.sysname, d_hdr + off, GV_HOSTNAME_LEN);
	off += GV_HOSTNAME_LEN;
	bcopy(m_hdr->label.name, d_hdr + off, GV_MAXDRIVENAME);
	off += GV_MAXDRIVENAME;
	GV_SET64BE(m_hdr->label.date_of_birth.tv_sec);
	GV_SET64BE(m_hdr->label.date_of_birth.tv_usec);
	GV_SET64BE(m_hdr->label.last_update.tv_sec);
	GV_SET64BE(m_hdr->label.last_update.tv_usec);
	GV_SET64BE(m_hdr->label.drive_size);

	ret = g_write_data(cp, GV_HDR_OFFSET, d_hdr, GV_HDR_LEN);
	return (ret);
}

void
gv_config_new_drive(struct gv_drive *d)
{
	struct gv_hdr *vhdr;
	struct gv_freelist *fl;

	KASSERT(d != NULL, ("config_new_drive: NULL d"));

	vhdr = g_malloc(sizeof(*vhdr), M_WAITOK | M_ZERO);
	vhdr->magic = GV_MAGIC;
	vhdr->config_length = GV_CFG_LEN;

	mtx_lock(&hostname_mtx);
	bcopy(G_hostname, vhdr->label.sysname, GV_HOSTNAME_LEN);
	mtx_unlock(&hostname_mtx);
	strncpy(vhdr->label.name, d->name, GV_MAXDRIVENAME);
	microtime(&vhdr->label.date_of_birth);

	d->hdr = vhdr;

	LIST_INIT(&d->subdisks);
	LIST_INIT(&d->freelist);

	fl = g_malloc(sizeof(struct gv_freelist), M_WAITOK | M_ZERO);
	fl->offset = GV_DATA_START;
	fl->size = d->avail;
	LIST_INSERT_HEAD(&d->freelist, fl, freelist);
	d->freelist_entries = 1;

	d->bqueue = g_malloc(sizeof(struct bio_queue_head), M_WAITOK | M_ZERO);
	bioq_init(d->bqueue);
	mtx_init(&d->bqueue_mtx, "gv_drive", NULL, MTX_DEF);
	kproc_create(gv_drive_worker, d, NULL, 0, 0, "gv_d %s", d->name);
	d->flags |= GV_DRIVE_THREAD_ACTIVE;
}

void
gv_save_config_all(struct gv_softc *sc)
{
	struct gv_drive *d;

	g_topology_assert();

	LIST_FOREACH(d, &sc->drives, drive) {
		if (d->geom == NULL)
			continue;
		gv_save_config(NULL, d, sc);
	}
}

/* Save the vinum configuration back to disk. */
void
gv_save_config(struct g_consumer *cp, struct gv_drive *d, struct gv_softc *sc)
{
	struct g_geom *gp;
	struct g_consumer *cp2;
	struct gv_hdr *vhdr, *hdr;
	struct sbuf *sb;
	int error;

	g_topology_assert();

	KASSERT(d != NULL, ("gv_save_config: null d"));
	KASSERT(sc != NULL, ("gv_save_config: null sc"));

	/*
	 * We can't save the config on a drive that isn't up, but drives that
	 * were just created aren't officially up yet, so we check a special
	 * flag.
	 */
	if ((d->state != GV_DRIVE_UP) && !(d->flags & GV_DRIVE_NEWBORN))
		return;

	if (cp == NULL) {
		gp = d->geom;
		KASSERT(gp != NULL, ("gv_save_config: null gp"));
		cp2 = LIST_FIRST(&gp->consumer);
		KASSERT(cp2 != NULL, ("gv_save_config: null cp2"));
	} else
		cp2 = cp;

	vhdr = g_malloc(GV_HDR_LEN, M_WAITOK | M_ZERO);
	vhdr->magic = GV_MAGIC;
	vhdr->config_length = GV_CFG_LEN;

	hdr = d->hdr;
	if (hdr == NULL) {
		G_VINUM_DEBUG(0, "drive %s has NULL hdr", d->name);
		g_free(vhdr);
		return;
	}
	microtime(&hdr->label.last_update);
	bcopy(&hdr->label, &vhdr->label, sizeof(struct gv_label));

	sb = sbuf_new(NULL, NULL, GV_CFG_LEN, SBUF_FIXEDLEN);
	gv_format_config(sc, sb, 1, NULL);
	sbuf_finish(sb);

	error = g_access(cp2, 0, 1, 0);
	if (error) {
		G_VINUM_DEBUG(0, "g_access failed on drive %s, errno %d",
		    d->name, error);
		sbuf_delete(sb);
		g_free(vhdr);
		return;
	}
	g_topology_unlock();

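	/*
	 * What gets written below: the vinum header at GV_HDR_OFFSET (via
	 * gv_write_header()), followed by two identical copies of the text
	 * configuration at GV_CFG_OFFSET and GV_CFG_OFFSET + GV_CFG_LEN.
	 */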
	do {
		error = gv_write_header(cp2, vhdr);
		if (error) {
			G_VINUM_DEBUG(0, "writing vhdr failed on drive %s, "
			    "errno %d", d->name, error);
			break;
		}

		error = g_write_data(cp2, GV_CFG_OFFSET, sbuf_data(sb),
		    GV_CFG_LEN);
		if (error) {
			G_VINUM_DEBUG(0, "writing first config copy failed "
			    "on drive %s, errno %d", d->name, error);
			break;
		}

		error = g_write_data(cp2, GV_CFG_OFFSET + GV_CFG_LEN,
		    sbuf_data(sb), GV_CFG_LEN);
		if (error)
			G_VINUM_DEBUG(0, "writing second config copy failed "
			    "on drive %s, errno %d", d->name, error);
	} while (0);

	g_topology_lock();
	g_access(cp2, 0, -1, 0);
	sbuf_delete(sb);
	g_free(vhdr);

	if (d->geom != NULL)
		gv_drive_modify(d);
}

/* This resembles g_slice_access(). */
static int
gv_drive_access(struct g_provider *pp, int dr, int dw, int de)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	struct g_provider *pp2;
	struct gv_drive *d;
	struct gv_sd *s, *s2;
	int error;

	gp = pp->geom;
	cp = LIST_FIRST(&gp->consumer);
	if (cp == NULL)
		return (0);

	d = gp->softc;
	if (d == NULL)
		return (0);

	s = pp->private;
	KASSERT(s != NULL, ("gv_drive_access: NULL s"));

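	/*
	 * Walk the other subdisks on this drive and refuse the request if it
	 * would combine write access with exclusive access on overlapping
	 * ranges: a new writer is rejected while an overlapping subdisk is
	 * held exclusively, and a new exclusive open is rejected while an
	 * overlapping subdisk is open for writing.
	 */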
	LIST_FOREACH(s2, &d->subdisks, from_drive) {
		if (s == s2)
			continue;
		if (s->drive_offset + s->size <= s2->drive_offset)
			continue;
		if (s2->drive_offset + s2->size <= s->drive_offset)
			continue;

		/* Overlap. */
		pp2 = s2->provider;
		KASSERT(s2 != NULL, ("gv_drive_access: NULL s2"));
		if ((pp->acw + dw) > 0 && pp2->ace > 0)
			return (EPERM);
		if ((pp->ace + de) > 0 && pp2->acw > 0)
			return (EPERM);
	}

	error = g_access(cp, dr, dw, de);
	return (error);
}

static void
gv_drive_done(struct bio *bp)
{
	struct gv_drive *d;

	/* Put the BIO on the worker queue again. */
	d = bp->bio_from->geom->softc;
	bp->bio_cflags |= GV_BIO_DONE;
	mtx_lock(&d->bqueue_mtx);
	bioq_insert_tail(d->bqueue, bp);
	wakeup(d);
	mtx_unlock(&d->bqueue_mtx);
}

static void
gv_drive_start(struct bio *bp)
{
	struct gv_drive *d;
	struct gv_sd *s;

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	case BIO_GETATTR:
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	s = bp->bio_to->private;
	if ((s->state == GV_SD_DOWN) || (s->state == GV_SD_STALE)) {
		g_io_deliver(bp, ENXIO);
		return;
	}

	d = bp->bio_to->geom->softc;

	/*
	 * Put the BIO on the worker queue, where the worker thread will pick
	 * it up.
	 */
	mtx_lock(&d->bqueue_mtx);
	bioq_disksort(d->bqueue, bp);
	wakeup(d);
	mtx_unlock(&d->bqueue_mtx);
}

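/*
 * Worker thread for a drive.  gv_drive_start() queues incoming BIOs here;
 * the worker clones them, shifts the offset by the subdisk's offset on the
 * drive and sends them down to the consumer.  Completed clones come back
 * through gv_drive_done() with GV_BIO_DONE set and are finished off with
 * g_std_done(); an I/O error additionally marks the drive down and schedules
 * gv_drive_dead().  The loop exits once GV_DRIVE_THREAD_DIE is set, after
 * which the queue is drained (undelivered BIOs are failed with ENXIO).
 */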
static void
gv_drive_worker(void *arg)
{
	struct bio *bp, *cbp;
	struct g_geom *gp;
	struct g_provider *pp;
	struct gv_drive *d;
	struct gv_sd *s;
	int error;

	d = arg;

	mtx_lock(&d->bqueue_mtx);
	for (;;) {
		/* We were signaled to exit. */
		if (d->flags & GV_DRIVE_THREAD_DIE)
			break;

		/* Take the first BIO from our queue. */
		bp = bioq_takefirst(d->bqueue);
		if (bp == NULL) {
			msleep(d, &d->bqueue_mtx, PRIBIO, "-", hz/10);
			continue;
		}
		mtx_unlock(&d->bqueue_mtx);

		pp = bp->bio_to;
		gp = pp->geom;

		/* Completed request. */
		if (bp->bio_cflags & GV_BIO_DONE) {
			error = bp->bio_error;

			/* Deliver the original request. */
			g_std_done(bp);

			/* The request had an error; we need to clean up. */
			if (error != 0) {
				g_topology_lock();
				gv_set_drive_state(d, GV_DRIVE_DOWN,
				    GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG);
				g_topology_unlock();
				g_post_event(gv_drive_dead, d, M_WAITOK, d,
				    NULL);
			}

		/* New request, needs to be sent downwards. */
		} else {
			s = pp->private;

			if ((s->state == GV_SD_DOWN) ||
			    (s->state == GV_SD_STALE)) {
				g_io_deliver(bp, ENXIO);
				mtx_lock(&d->bqueue_mtx);
				continue;
			}
			if (bp->bio_offset > s->size) {
				g_io_deliver(bp, EINVAL);
				mtx_lock(&d->bqueue_mtx);
				continue;
			}

			cbp = g_clone_bio(bp);
			if (cbp == NULL) {
				g_io_deliver(bp, ENOMEM);
				mtx_lock(&d->bqueue_mtx);
				continue;
			}
			if (cbp->bio_offset + cbp->bio_length > s->size)
				cbp->bio_length = s->size -
				    cbp->bio_offset;
			cbp->bio_done = gv_drive_done;
			cbp->bio_offset += s->drive_offset;
			g_io_request(cbp, LIST_FIRST(&gp->consumer));
		}

		mtx_lock(&d->bqueue_mtx);
	}

	while ((bp = bioq_takefirst(d->bqueue)) != NULL) {
		mtx_unlock(&d->bqueue_mtx);
		if (bp->bio_cflags & GV_BIO_DONE)
			g_std_done(bp);
		else
			g_io_deliver(bp, ENXIO);
		mtx_lock(&d->bqueue_mtx);
	}
	mtx_unlock(&d->bqueue_mtx);
	d->flags |= GV_DRIVE_THREAD_DEAD;

	kproc_exit(ENXIO);
}

static void
gv_drive_orphan(struct g_consumer *cp)
{
	struct g_geom *gp;
	struct gv_drive *d;

	g_topology_assert();
	gp = cp->geom;
	g_trace(G_T_TOPOLOGY, "gv_drive_orphan(%s)", gp->name);
	d = gp->softc;
	if (d != NULL) {
		gv_set_drive_state(d, GV_DRIVE_DOWN,
		    GV_SETSTATE_FORCE | GV_SETSTATE_CONFIG);
		g_post_event(gv_drive_dead, d, M_WAITOK, d, NULL);
	} else
		g_wither_geom(gp, ENXIO);
}

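/*
 * Taste a provider and, if it carries a valid vinum header, turn it into a
 * vinum drive: read and parse the on-disk configuration, create or update
 * the gv_drive object, set up its bio queue and worker thread, crosslink
 * the subdisks that live on it and create providers for them.
 */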
static struct g_geom *
gv_drive_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_geom *gp, *gp2;
	struct g_consumer *cp;
	struct gv_drive *d;
	struct gv_sd *s;
	struct gv_softc *sc;
	struct gv_freelist *fl;
	struct gv_hdr *vhdr;
	int error;
	char *buf, errstr[ERRBUFSIZ];

	vhdr = NULL;
	d = NULL;

	g_trace(G_T_TOPOLOGY, "gv_drive_taste(%s, %s)", mp->name, pp->name);
	g_topology_assert();

	/* Find the VINUM class and its associated geom. */
	gp2 = find_vinum_geom();
	if (gp2 == NULL)
		return (NULL);
	sc = gp2->softc;

	gp = g_new_geomf(mp, "%s.vinumdrive", pp->name);
	gp->start = gv_drive_start;
	gp->orphan = gv_drive_orphan;
	gp->access = gv_drive_access;

	cp = g_new_consumer(gp);
	g_attach(cp, pp);
	error = g_access(cp, 1, 0, 0);
	if (error) {
		g_detach(cp);
		g_destroy_consumer(cp);
		g_destroy_geom(gp);
		return (NULL);
	}

	g_topology_unlock();

	/* Now check if the provided slice is a valid vinum drive. */
	do {
		vhdr = g_malloc(GV_HDR_LEN, M_WAITOK | M_ZERO);
		error = gv_read_header(cp, vhdr);
		if (error) {
			g_free(vhdr);
			break;
		}

		/* A valid vinum drive, let's parse the on-disk information. */
		buf = g_read_data(cp, GV_CFG_OFFSET, GV_CFG_LEN, NULL);
		if (buf == NULL) {
			g_free(vhdr);
			break;
		}
		g_topology_lock();
		gv_parse_config(sc, buf, 1);
		g_free(buf);

		/*
		 * Let's see if this drive is already known in the
		 * configuration.
		 */
		d = gv_find_drive(sc, vhdr->label.name);

		/* We already know about this drive. */
		if (d != NULL) {
			/* Check if this drive already has a geom. */
			if (d->geom != NULL) {
				g_topology_unlock();
				g_free(vhdr);
				break;
			}
			bcopy(vhdr, d->hdr, sizeof(*vhdr));
			g_free(vhdr);

		/* This is a new drive. */
		} else {
			d = g_malloc(sizeof(*d), M_WAITOK | M_ZERO);

			/* Initialize all needed variables. */
			d->size = pp->mediasize - GV_DATA_START;
			d->avail = d->size;
			d->hdr = vhdr;
			strncpy(d->name, vhdr->label.name, GV_MAXDRIVENAME);
			LIST_INIT(&d->subdisks);
			LIST_INIT(&d->freelist);

			/* We also need a freelist entry. */
			fl = g_malloc(sizeof(*fl), M_WAITOK | M_ZERO);
			fl->offset = GV_DATA_START;
			fl->size = d->avail;
			LIST_INSERT_HEAD(&d->freelist, fl, freelist);
			d->freelist_entries = 1;

			/* Save it into the main configuration. */
			LIST_INSERT_HEAD(&sc->drives, d, drive);
		}

		/*
		 * Create bio queue, queue mutex and a worker thread, if
		 * necessary.
		 */
		if (d->bqueue == NULL) {
			d->bqueue = g_malloc(sizeof(struct bio_queue_head),
			    M_WAITOK | M_ZERO);
			bioq_init(d->bqueue);
		}
		if (mtx_initialized(&d->bqueue_mtx) == 0)
			mtx_init(&d->bqueue_mtx, "gv_drive", NULL, MTX_DEF);

		if (!(d->flags & GV_DRIVE_THREAD_ACTIVE)) {
			kproc_create(gv_drive_worker, d, NULL, 0, 0,
			    "gv_d %s", d->name);
			d->flags |= GV_DRIVE_THREAD_ACTIVE;
		}

		g_access(cp, -1, 0, 0);

		gp->softc = d;
		d->geom = gp;
		d->vinumconf = sc;
		strncpy(d->device, pp->name, GV_MAXDRIVENAME);

		/*
		 * Find out which subdisks belong to this drive and crosslink
		 * them.
		 */
		LIST_FOREACH(s, &sc->subdisks, sd) {
			if (!strncmp(s->drive, d->name, GV_MAXDRIVENAME))
				/* XXX: errors ignored */
				gv_sd_to_drive(sc, d, s, errstr,
				    sizeof(errstr));
		}

		/* This drive is now up for sure. */
		gv_set_drive_state(d, GV_DRIVE_UP, 0);

		/*
		 * If there are subdisks on this drive, we need to create
		 * providers for them.
		 */
		if (d->sdcount)
			gv_drive_modify(d);

		return (gp);

	} while (0);

	g_topology_lock();
	g_access(cp, -1, 0, 0);

	g_detach(cp);
	g_destroy_consumer(cp);
	g_destroy_geom(gp);
	return (NULL);
}

/*
 * Modify the providers for the given drive 'd'.  It is assumed that the
 * subdisk list of 'd' is already correctly set up.
 */
void
gv_drive_modify(struct gv_drive *d)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	struct g_provider *pp, *pp2;
	struct gv_sd *s;

	KASSERT(d != NULL, ("gv_drive_modify: null d"));
	gp = d->geom;
	KASSERT(gp != NULL, ("gv_drive_modify: null gp"));
	cp = LIST_FIRST(&gp->consumer);
	KASSERT(cp != NULL, ("gv_drive_modify: null cp"));
	pp = cp->provider;
	KASSERT(pp != NULL, ("gv_drive_modify: null pp"));

	g_topology_assert();

	LIST_FOREACH(s, &d->subdisks, from_drive) {
		/* This subdisk already has a provider. */
		if (s->provider != NULL)
			continue;
		pp2 = g_new_providerf(gp, "gvinum/sd/%s", s->name);
		pp2->mediasize = s->size;
		pp2->sectorsize = pp->sectorsize;
		g_error_provider(pp2, 0);
		s->provider = pp2;
		pp2->private = s;
	}
}

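/*
 * Event handler, scheduled with g_post_event() once a drive has gone down:
 * drop any access counts still held on its consumers, clear the subdisks'
 * provider and consumer pointers, kill the worker thread and wither the
 * geom.  If a consumer still has requests in flight, the event simply
 * reschedules itself.
 */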
static void
gv_drive_dead(void *arg, int flag)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	struct gv_drive *d;
	struct gv_sd *s;

	g_topology_assert();
	KASSERT(arg != NULL, ("gv_drive_dead: NULL arg"));

	if (flag == EV_CANCEL)
		return;

	d = arg;
	if (d->state != GV_DRIVE_DOWN)
		return;

	g_trace(G_T_TOPOLOGY, "gv_drive_dead(%s)", d->name);

	gp = d->geom;
	if (gp == NULL)
		return;

	LIST_FOREACH(cp, &gp->consumer, consumer) {
		if (cp->nstart != cp->nend) {
			G_VINUM_DEBUG(0, "dead drive '%s' still has "
			    "active requests, cannot detach consumer",
			    d->name);
			g_post_event(gv_drive_dead, d, M_WAITOK, d,
			    NULL);
			return;
		}
		if (cp->acr != 0 || cp->acw != 0 || cp->ace != 0)
			g_access(cp, -cp->acr, -cp->acw, -cp->ace);
	}

	G_VINUM_DEBUG(1, "lost drive '%s'", d->name);
	d->geom = NULL;
	LIST_FOREACH(s, &d->subdisks, from_drive) {
		s->provider = NULL;
		s->consumer = NULL;
	}
	gv_kill_drive_thread(d);
	gp->softc = NULL;
	g_wither_geom(gp, ENXIO);
}

static int
gv_drive_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp)
{
	struct gv_drive *d;

	g_trace(G_T_TOPOLOGY, "gv_drive_destroy_geom: %s", gp->name);
	g_topology_assert();

	d = gp->softc;
	gv_kill_drive_thread(d);

	g_wither_geom(gp, ENXIO);
	return (0);
}

#define	VINUMDRIVE_CLASS_NAME "VINUMDRIVE"

static struct g_class g_vinum_drive_class = {
	.name = VINUMDRIVE_CLASS_NAME,
	.version = G_VERSION,
	.taste = gv_drive_taste,
	.destroy_geom = gv_drive_destroy_geom
};

DECLARE_GEOM_CLASS(g_vinum_drive_class, g_vinum_drive);