vdev_geom.c revision 292069
1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22168404Spjd * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
23168404Spjd * All rights reserved.
24236155Smm *
25236155Smm * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
26168404Spjd */
27168404Spjd
28168404Spjd#include <sys/zfs_context.h>
29168404Spjd#include <sys/param.h>
30168404Spjd#include <sys/kernel.h>
31168404Spjd#include <sys/bio.h>
32169303Spjd#include <sys/disk.h>
33168404Spjd#include <sys/spa.h>
34205346Spjd#include <sys/spa_impl.h>
35168404Spjd#include <sys/vdev_impl.h>
36168404Spjd#include <sys/fs/zfs.h>
37168404Spjd#include <sys/zio.h>
38168404Spjd#include <geom/geom.h>
39169303Spjd#include <geom/geom_int.h>
40168404Spjd
/*
 * Virtual device vector for GEOM.
 */

/* Forward declaration: the handler is defined further down in this file. */
static g_attrchanged_t vdev_geom_attrchanged;
/*
 * GEOM class that owns every geom created by this file ("zfs::vdev" and
 * "zfs::vdev::taste").  Attribute-change notifications for consumers of
 * this class are delivered to vdev_geom_attrchanged().
 */
struct g_class zfs_vdev_class = {
	.name = "ZFS::VDEV",
	.version = G_VERSION,
	.attrchanged = vdev_geom_attrchanged,
};

/* Make the class known to GEOM under the module name zfs_vdev. */
DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
53168404Spjd
SYSCTL_DECL(_vfs_zfs_vdev);
/*
 * Don't send BIO_FLUSH.
 * When non-zero, vdev_geom_io_start() completes DKIOCFLUSHWRITECACHE
 * ioctls without issuing a request to the provider.
 */
static int vdev_geom_bio_flush_disable;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
/*
 * Don't send BIO_DELETE.
 * When non-zero, vdev_geom_io_start() completes ZIO_TYPE_FREE zios
 * without issuing a request to the provider.
 */
static int vdev_geom_bio_delete_disable;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
63219089Spjd
64168404Spjdstatic void
65256956Ssmhvdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
66256956Ssmh{
67256956Ssmh	int error;
68256956Ssmh	uint16_t rate;
69256956Ssmh
70256956Ssmh	error = g_getattr("GEOM::rotation_rate", cp, &rate);
71256956Ssmh	if (error == 0)
72256956Ssmh		vd->vdev_rotation_rate = rate;
73256956Ssmh	else
74256956Ssmh		vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
75256956Ssmh}
76256956Ssmh
77256956Ssmhstatic void
78256956Ssmhvdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
79256956Ssmh{
80256956Ssmh	vdev_t *vd;
81256956Ssmh
82256956Ssmh	vd = cp->private;
83256956Ssmh	if (vd == NULL)
84256956Ssmh		return;
85256956Ssmh
86256956Ssmh	if (strcmp(attr, "GEOM::rotation_rate") == 0) {
87256956Ssmh		vdev_geom_set_rotation_rate(vd, cp);
88256956Ssmh		return;
89256956Ssmh	}
90256956Ssmh}
91256956Ssmh
92256956Ssmhstatic void
93168404Spjdvdev_geom_orphan(struct g_consumer *cp)
94168404Spjd{
95168404Spjd	vdev_t *vd;
96168404Spjd
97168404Spjd	g_topology_assert();
98168404Spjd
99168404Spjd	vd = cp->private;
100253754Smav	if (vd == NULL)
101253754Smav		return;
102168404Spjd
103219089Spjd	/*
104219089Spjd	 * Orphan callbacks occur from the GEOM event thread.
105219089Spjd	 * Concurrent with this call, new I/O requests may be
106219089Spjd	 * working their way through GEOM about to find out
107219089Spjd	 * (only once executed by the g_down thread) that we've
108219089Spjd	 * been orphaned from our disk provider.  These I/Os
109219089Spjd	 * must be retired before we can detach our consumer.
110219089Spjd	 * This is most easily achieved by acquiring the
111219089Spjd	 * SPA ZIO configuration lock as a writer, but doing
112219089Spjd	 * so with the GEOM topology lock held would cause
113219089Spjd	 * a lock order reversal.  Instead, rely on the SPA's
114219089Spjd	 * async removal support to invoke a close on this
115219089Spjd	 * vdev once it is safe to do so.
116219089Spjd	 */
117219089Spjd	zfs_post_remove(vd->vdev_spa, vd);
118185029Spjd	vd->vdev_remove_wanted = B_TRUE;
119185029Spjd	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
120168404Spjd}
121168404Spjd
122168404Spjdstatic struct g_consumer *
123203504Spjdvdev_geom_attach(struct g_provider *pp)
124168404Spjd{
125168404Spjd	struct g_geom *gp;
126168404Spjd	struct g_consumer *cp;
127168404Spjd
128168404Spjd	g_topology_assert();
129168404Spjd
130168404Spjd	ZFS_LOG(1, "Attaching to %s.", pp->name);
131168404Spjd	/* Do we have geom already? No? Create one. */
132168404Spjd	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
133169303Spjd		if (gp->flags & G_GEOM_WITHER)
134169303Spjd			continue;
135169303Spjd		if (strcmp(gp->name, "zfs::vdev") != 0)
136169303Spjd			continue;
137169303Spjd		break;
138168404Spjd	}
139168404Spjd	if (gp == NULL) {
140168404Spjd		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
141168404Spjd		gp->orphan = vdev_geom_orphan;
142168404Spjd		cp = g_new_consumer(gp);
143168404Spjd		if (g_attach(cp, pp) != 0) {
144168404Spjd			g_wither_geom(gp, ENXIO);
145168404Spjd			return (NULL);
146168404Spjd		}
147203504Spjd		if (g_access(cp, 1, 0, 1) != 0) {
148168404Spjd			g_wither_geom(gp, ENXIO);
149168404Spjd			return (NULL);
150168404Spjd		}
151168404Spjd		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
152168404Spjd	} else {
153168404Spjd		/* Check if we are already connected to this provider. */
154168404Spjd		LIST_FOREACH(cp, &gp->consumer, consumer) {
155168404Spjd			if (cp->provider == pp) {
156168404Spjd				ZFS_LOG(1, "Found consumer for %s.", pp->name);
157168404Spjd				break;
158168404Spjd			}
159168404Spjd		}
160168404Spjd		if (cp == NULL) {
161168404Spjd			cp = g_new_consumer(gp);
162168404Spjd			if (g_attach(cp, pp) != 0) {
163168404Spjd				g_destroy_consumer(cp);
164168404Spjd				return (NULL);
165168404Spjd			}
166203504Spjd			if (g_access(cp, 1, 0, 1) != 0) {
167168404Spjd				g_detach(cp);
168168404Spjd				g_destroy_consumer(cp);
169168404Spjd				return (NULL);
170168404Spjd			}
171168404Spjd			ZFS_LOG(1, "Created consumer for %s.", pp->name);
172168404Spjd		} else {
173203504Spjd			if (g_access(cp, 1, 0, 1) != 0)
174168404Spjd				return (NULL);
175168404Spjd			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
176168404Spjd		}
177168404Spjd	}
178256880Smav	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
179168404Spjd	return (cp);
180168404Spjd}
181168404Spjd
182168404Spjdstatic void
183168404Spjdvdev_geom_detach(void *arg, int flag __unused)
184168404Spjd{
185168404Spjd	struct g_geom *gp;
186168404Spjd	struct g_consumer *cp;
187168404Spjd
188168404Spjd	g_topology_assert();
189168404Spjd	cp = arg;
190168404Spjd	gp = cp->geom;
191168404Spjd
192168404Spjd	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
193168404Spjd	g_access(cp, -1, 0, -1);
194168404Spjd	/* Destroy consumer on last close. */
195168404Spjd	if (cp->acr == 0 && cp->ace == 0) {
196168404Spjd		ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name);
197168404Spjd		if (cp->acw > 0)
198168404Spjd			g_access(cp, 0, -cp->acw, 0);
199168404Spjd		g_detach(cp);
200168404Spjd		g_destroy_consumer(cp);
201168404Spjd	}
202168404Spjd	/* Destroy geom if there are no consumers left. */
203168404Spjd	if (LIST_EMPTY(&gp->consumer)) {
204168404Spjd		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
205168404Spjd		g_wither_geom(gp, ENXIO);
206168404Spjd	}
207168404Spjd}
208168404Spjd
209292066Sasomersstatic void
210292066Sasomersnvlist_get_guids(nvlist_t *list, uint64_t *pguid, uint64_t *vguid)
211185029Spjd{
212185029Spjd
213292066Sasomers	nvlist_lookup_uint64(list, ZPOOL_CONFIG_GUID, vguid);
214292066Sasomers	nvlist_lookup_uint64(list, ZPOOL_CONFIG_POOL_GUID, pguid);
215185029Spjd}
216185029Spjd
217185029Spjdstatic int
218185029Spjdvdev_geom_io(struct g_consumer *cp, int cmd, void *data, off_t offset, off_t size)
219185029Spjd{
220185029Spjd	struct bio *bp;
221185029Spjd	u_char *p;
222208682Spjd	off_t off, maxio;
223185029Spjd	int error;
224185029Spjd
225185029Spjd	ASSERT((offset % cp->provider->sectorsize) == 0);
226185029Spjd	ASSERT((size % cp->provider->sectorsize) == 0);
227185029Spjd
228185029Spjd	bp = g_alloc_bio();
229185029Spjd	off = offset;
230185029Spjd	offset += size;
231185029Spjd	p = data;
232208682Spjd	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
233185029Spjd	error = 0;
234185029Spjd
235208682Spjd	for (; off < offset; off += maxio, p += maxio, size -= maxio) {
236185029Spjd		bzero(bp, sizeof(*bp));
237185029Spjd		bp->bio_cmd = cmd;
238185029Spjd		bp->bio_done = NULL;
239185029Spjd		bp->bio_offset = off;
240208682Spjd		bp->bio_length = MIN(size, maxio);
241185029Spjd		bp->bio_data = p;
242185029Spjd		g_io_request(bp, cp);
243185029Spjd		error = biowait(bp, "vdev_geom_io");
244185029Spjd		if (error != 0)
245185029Spjd			break;
246185029Spjd	}
247185029Spjd
248185029Spjd	g_destroy_bio(bp);
249185029Spjd	return (error);
250185029Spjd}
251185029Spjd
252241286Savgstatic void
253241286Savgvdev_geom_taste_orphan(struct g_consumer *cp)
254185029Spjd{
255241286Savg
256241286Savg	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
257241286Savg	    cp->provider->name));
258241286Savg}
259241286Savg
260241286Savgstatic int
261241286Savgvdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
262241286Savg{
263185029Spjd	struct g_provider *pp;
264185029Spjd	vdev_label_t *label;
265185029Spjd	char *p, *buf;
266185029Spjd	size_t buflen;
267185029Spjd	uint64_t psize;
268185029Spjd	off_t offset, size;
269292066Sasomers	uint64_t state, txg;
270219089Spjd	int error, l, len;
271185029Spjd
272185029Spjd	g_topology_assert_not();
273185029Spjd
274185029Spjd	pp = cp->provider;
275241286Savg	ZFS_LOG(1, "Reading config from %s...", pp->name);
276185029Spjd
277185029Spjd	psize = pp->mediasize;
278185029Spjd	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));
279185029Spjd
280185029Spjd	size = sizeof(*label) + pp->sectorsize -
281185029Spjd	    ((sizeof(*label) - 1) % pp->sectorsize) - 1;
282185029Spjd
283185029Spjd	label = kmem_alloc(size, KM_SLEEP);
284185029Spjd	buflen = sizeof(label->vl_vdev_phys.vp_nvlist);
285185029Spjd
286241286Savg	*config = NULL;
287185174Spjd	for (l = 0; l < VDEV_LABELS; l++) {
288185029Spjd
289185029Spjd		offset = vdev_label_offset(psize, l, 0);
290185029Spjd		if ((offset % pp->sectorsize) != 0)
291185029Spjd			continue;
292185029Spjd
293200124Spjd		if (vdev_geom_io(cp, BIO_READ, label, offset, size) != 0)
294185029Spjd			continue;
295185029Spjd		buf = label->vl_vdev_phys.vp_nvlist;
296185029Spjd
297241286Savg		if (nvlist_unpack(buf, buflen, config, 0) != 0)
298185029Spjd			continue;
299185029Spjd
300241286Savg		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
301252056Ssmh		    &state) != 0 || state > POOL_STATE_L2CACHE) {
302241286Savg			nvlist_free(*config);
303241286Savg			*config = NULL;
304241286Savg			continue;
305241286Savg		}
306241286Savg
307242135Savg		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
308242135Savg		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
309242135Savg		    &txg) != 0 || txg == 0)) {
310241286Savg			nvlist_free(*config);
311241286Savg			*config = NULL;
312241286Savg			continue;
313241286Savg		}
314241286Savg
315241286Savg		break;
316185029Spjd	}
317185029Spjd
318185029Spjd	kmem_free(label, size);
319241286Savg	return (*config == NULL ? ENOENT : 0);
320185029Spjd}
321185029Spjd
322243502Savgstatic void
323243502Savgresize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
324241286Savg{
325243502Savg	nvlist_t **new_configs;
326243502Savg	uint64_t i;
327243502Savg
328243502Savg	if (id < *count)
329243502Savg		return;
330244635Savg	new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
331244635Savg	    KM_SLEEP);
332243502Savg	for (i = 0; i < *count; i++)
333243502Savg		new_configs[i] = (*configs)[i];
334243502Savg	if (*configs != NULL)
335243502Savg		kmem_free(*configs, *count * sizeof(void *));
336243502Savg	*configs = new_configs;
337243502Savg	*count = id + 1;
338243502Savg}
339243502Savg
340243502Savgstatic void
341243502Savgprocess_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
342243502Savg    const char *name, uint64_t* known_pool_guid)
343243502Savg{
344243502Savg	nvlist_t *vdev_tree;
345243502Savg	uint64_t pool_guid;
346243502Savg	uint64_t vdev_guid, known_guid;
347243502Savg	uint64_t id, txg, known_txg;
348241286Savg	char *pname;
349243502Savg	int i;
350241286Savg
351243502Savg	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
352241286Savg	    strcmp(pname, name) != 0)
353243502Savg		goto ignore;
354241286Savg
355243502Savg	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
356243502Savg		goto ignore;
357241286Savg
358243502Savg	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
359243502Savg		goto ignore;
360241286Savg
361243502Savg	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
362243502Savg		goto ignore;
363243502Savg
364243502Savg	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
365243502Savg		goto ignore;
366243502Savg
367243502Savg	VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
368243502Savg
369243502Savg	if (*known_pool_guid != 0) {
370243502Savg		if (pool_guid != *known_pool_guid)
371243502Savg			goto ignore;
372243502Savg	} else
373243502Savg		*known_pool_guid = pool_guid;
374243502Savg
375243502Savg	resize_configs(configs, count, id);
376243502Savg
377243502Savg	if ((*configs)[id] != NULL) {
378243502Savg		VERIFY(nvlist_lookup_uint64((*configs)[id],
379243502Savg		    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
380243502Savg		if (txg <= known_txg)
381243502Savg			goto ignore;
382243502Savg		nvlist_free((*configs)[id]);
383243502Savg	}
384243502Savg
385243502Savg	(*configs)[id] = cfg;
386243502Savg	return;
387243502Savg
388243502Savgignore:
389243502Savg	nvlist_free(cfg);
390241286Savg}
391241286Savg
392241286Savgstatic int
393241286Savgvdev_geom_attach_taster(struct g_consumer *cp, struct g_provider *pp)
394241286Savg{
395241286Savg	int error;
396241286Savg
397241286Savg	if (pp->flags & G_PF_WITHER)
398241286Savg		return (EINVAL);
399241286Savg	g_attach(cp, pp);
400241286Savg	error = g_access(cp, 1, 0, 0);
401259168Smav	if (error == 0) {
402259168Smav		if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize))
403259168Smav			error = EINVAL;
404259168Smav		else if (pp->mediasize < SPA_MINDEVSIZE)
405259168Smav			error = EINVAL;
406259168Smav		if (error != 0)
407259168Smav			g_access(cp, -1, 0, 0);
408259168Smav	}
409241286Savg	if (error != 0)
410241286Savg		g_detach(cp);
411241286Savg	return (error);
412241286Savg}
413241286Savg
/*
 * Undo a successful vdev_geom_attach_taster(): release the read
 * reference and detach the tasting consumer from its provider.
 */
static void
vdev_geom_detach_taster(struct g_consumer *cp)
{

	(void) g_access(cp, -1, 0, 0);
	g_detach(cp);
}
420169303Spjd
421241286Savgint
422243502Savgvdev_geom_read_pool_label(const char *name,
423243502Savg    nvlist_t ***configs, uint64_t *count)
424241286Savg{
425241286Savg	struct g_class *mp;
426241286Savg	struct g_geom *gp, *zgp;
427241286Savg	struct g_provider *pp;
428241286Savg	struct g_consumer *zcp;
429241286Savg	nvlist_t *vdev_cfg;
430243502Savg	uint64_t pool_guid;
431241286Savg	int error;
432241286Savg
433241286Savg	DROP_GIANT();
434241286Savg	g_topology_lock();
435241286Savg
436241286Savg	zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
437241286Savg	/* This orphan function should be never called. */
438241286Savg	zgp->orphan = vdev_geom_taste_orphan;
439241286Savg	zcp = g_new_consumer(zgp);
440241286Savg
441243502Savg	*configs = NULL;
442243502Savg	*count = 0;
443243502Savg	pool_guid = 0;
444241286Savg	LIST_FOREACH(mp, &g_classes, class) {
445241286Savg		if (mp == &zfs_vdev_class)
446241286Savg			continue;
447241286Savg		LIST_FOREACH(gp, &mp->geom, geom) {
448241286Savg			if (gp->flags & G_GEOM_WITHER)
449241286Savg				continue;
450241286Savg			LIST_FOREACH(pp, &gp->provider, provider) {
451241286Savg				if (pp->flags & G_PF_WITHER)
452241286Savg					continue;
453241286Savg				if (vdev_geom_attach_taster(zcp, pp) != 0)
454241286Savg					continue;
455241286Savg				g_topology_unlock();
456241286Savg				error = vdev_geom_read_config(zcp, &vdev_cfg);
457241286Savg				g_topology_lock();
458242332Sdelphij				vdev_geom_detach_taster(zcp);
459241286Savg				if (error)
460241286Savg					continue;
461241286Savg				ZFS_LOG(1, "successfully read vdev config");
462241286Savg
463243502Savg				process_vdev_config(configs, count,
464243502Savg				    vdev_cfg, name, &pool_guid);
465241286Savg			}
466241286Savg		}
467241286Savg	}
468241286Savg
469241286Savg	g_destroy_consumer(zcp);
470241286Savg	g_destroy_geom(zgp);
471241286Savg	g_topology_unlock();
472241286Savg	PICKUP_GIANT();
473243502Savg
474243502Savg	return (*count > 0 ? 0 : ENOENT);
475169303Spjd}
476169303Spjd
477292066Sasomersstatic void
478292066Sasomersvdev_geom_read_guids(struct g_consumer *cp, uint64_t *pguid, uint64_t *vguid)
479241286Savg{
480241286Savg	nvlist_t *config;
481241286Savg
482241286Savg	g_topology_assert_not();
483241286Savg
484292066Sasomers	*pguid = 0;
485292066Sasomers	*vguid = 0;
486241286Savg	if (vdev_geom_read_config(cp, &config) == 0) {
487292066Sasomers		nvlist_get_guids(config, pguid, vguid);
488241286Savg		nvlist_free(config);
489241286Savg	}
490241286Savg}
491241286Savg
492219089Spjdstatic struct g_consumer *
493292066Sasomersvdev_geom_attach_by_guids(uint64_t pool_guid, uint64_t vdev_guid)
494169303Spjd{
495169303Spjd	struct g_class *mp;
496169303Spjd	struct g_geom *gp, *zgp;
497169303Spjd	struct g_provider *pp;
498219089Spjd	struct g_consumer *cp, *zcp;
499292066Sasomers	uint64_t pguid, vguid;
500169303Spjd
501169303Spjd	g_topology_assert();
502169303Spjd
503169303Spjd	zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
504169303Spjd	/* This orphan function should be never called. */
505169303Spjd	zgp->orphan = vdev_geom_taste_orphan;
506169303Spjd	zcp = g_new_consumer(zgp);
507169303Spjd
508219089Spjd	cp = NULL;
509169303Spjd	LIST_FOREACH(mp, &g_classes, class) {
510169303Spjd		if (mp == &zfs_vdev_class)
511169303Spjd			continue;
512169303Spjd		LIST_FOREACH(gp, &mp->geom, geom) {
513169303Spjd			if (gp->flags & G_GEOM_WITHER)
514169303Spjd				continue;
515169303Spjd			LIST_FOREACH(pp, &gp->provider, provider) {
516241286Savg				if (vdev_geom_attach_taster(zcp, pp) != 0)
517169303Spjd					continue;
518169303Spjd				g_topology_unlock();
519292066Sasomers				vdev_geom_read_guids(zcp, &pguid, &vguid);
520169303Spjd				g_topology_lock();
521242332Sdelphij				vdev_geom_detach_taster(zcp);
522292066Sasomers				if (pguid != pool_guid || vguid != vdev_guid)
523169303Spjd					continue;
524219089Spjd				cp = vdev_geom_attach(pp);
525219089Spjd				if (cp == NULL) {
526292069Sasomers					printf("ZFS WARNING: Unable to "
527292066Sasomers					    "attach to %s.\n", pp->name);
528169303Spjd					continue;
529169303Spjd				}
530219089Spjd				break;
531169303Spjd			}
532219089Spjd			if (cp != NULL)
533219089Spjd				break;
534169303Spjd		}
535219089Spjd		if (cp != NULL)
536219089Spjd			break;
537169303Spjd	}
538169303Spjdend:
539169303Spjd	g_destroy_consumer(zcp);
540169303Spjd	g_destroy_geom(zgp);
541169303Spjd	return (cp);
542169303Spjd}
543169303Spjd
544185029Spjdstatic struct g_consumer *
545292066Sasomersvdev_geom_open_by_guids(vdev_t *vd)
546168404Spjd{
547185174Spjd	struct g_consumer *cp;
548185174Spjd	char *buf;
549185174Spjd	size_t len;
550185174Spjd
551219089Spjd	g_topology_assert();
552219089Spjd
553185174Spjd	ZFS_LOG(1, "Searching by guid [%ju].", (uintmax_t)vd->vdev_guid);
554292066Sasomers	cp = vdev_geom_attach_by_guids(spa_guid(vd->vdev_spa), vd->vdev_guid);
555185174Spjd	if (cp != NULL) {
556185174Spjd		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
557185174Spjd		buf = kmem_alloc(len, KM_SLEEP);
558197842Spjd
559185174Spjd		snprintf(buf, len, "/dev/%s", cp->provider->name);
560185174Spjd		spa_strfree(vd->vdev_path);
561185174Spjd		vd->vdev_path = buf;
562185174Spjd
563292066Sasomers		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
564292066Sasomers		    (uintmax_t)spa_guid(vd->vdev_spa),
565185174Spjd		    (uintmax_t)vd->vdev_guid, vd->vdev_path);
566185174Spjd	} else {
567292066Sasomers		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
568292066Sasomers		    (uintmax_t)spa_guid(vd->vdev_spa),
569185174Spjd		    (uintmax_t)vd->vdev_guid);
570185174Spjd	}
571185174Spjd
572185174Spjd	return (cp);
573185174Spjd}
574185174Spjd
575185174Spjdstatic struct g_consumer *
576200158Spjdvdev_geom_open_by_path(vdev_t *vd, int check_guid)
577185174Spjd{
578168404Spjd	struct g_provider *pp;
579168404Spjd	struct g_consumer *cp;
580292066Sasomers	uint64_t pguid, vguid;
581168404Spjd
582219089Spjd	g_topology_assert();
583219089Spjd
584169303Spjd	cp = NULL;
585168404Spjd	pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
586169303Spjd	if (pp != NULL) {
587169303Spjd		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
588203504Spjd		cp = vdev_geom_attach(pp);
589218278Sae		if (cp != NULL && check_guid && ISP2(pp->sectorsize) &&
590218278Sae		    pp->sectorsize <= VDEV_PAD_SIZE) {
591169303Spjd			g_topology_unlock();
592292066Sasomers			vdev_geom_read_guids(cp, &pguid, &vguid);
593169303Spjd			g_topology_lock();
594292066Sasomers			if (pguid != spa_guid(vd->vdev_spa) ||
595292066Sasomers			    vguid != vd->vdev_guid) {
596169303Spjd				vdev_geom_detach(cp, 0);
597169303Spjd				cp = NULL;
598185174Spjd				ZFS_LOG(1, "guid mismatch for provider %s: "
599292066Sasomers				    "%ju:%ju != %ju:%ju.", vd->vdev_path,
600292066Sasomers				    (uintmax_t)spa_guid(vd->vdev_spa),
601292066Sasomers				    (uintmax_t)vd->vdev_guid,
602292066Sasomers				    (uintmax_t)pguid, (uintmax_t)vguid);
603185174Spjd			} else {
604185174Spjd				ZFS_LOG(1, "guid match for provider %s.",
605185029Spjd				    vd->vdev_path);
606185174Spjd			}
607169303Spjd		}
608168404Spjd	}
609169303Spjd
610185029Spjd	return (cp);
611185029Spjd}
612169303Spjd
613185029Spjdstatic int
614236155Smmvdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
615254591Sgibbs    uint64_t *logical_ashift, uint64_t *physical_ashift)
616185029Spjd{
617185029Spjd	struct g_provider *pp;
618185029Spjd	struct g_consumer *cp;
619219089Spjd	size_t bufsize;
620224791Spjd	int error;
621185029Spjd
622185029Spjd	/*
623185029Spjd	 * We must have a pathname, and it must be absolute.
624185029Spjd	 */
625185029Spjd	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
626185029Spjd		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
627185029Spjd		return (EINVAL);
628185029Spjd	}
629185029Spjd
630185029Spjd	vd->vdev_tsd = NULL;
631185029Spjd
632219089Spjd	DROP_GIANT();
633219089Spjd	g_topology_lock();
634203504Spjd	error = 0;
635205346Spjd
636292066Sasomers	if (vd->vdev_spa->spa_splitting_newspa ||
637292066Sasomers	    (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
638292066Sasomers	     vd->vdev_spa->spa_load_state == SPA_LOAD_NONE)) {
639292066Sasomers		/*
640292066Sasomers		 * We are dealing with a vdev that hasn't been previously
641292066Sasomers		 * opened (since boot), and we are not loading an
642292066Sasomers		 * existing pool configuration.  This looks like a
643292066Sasomers		 * vdev add operation to a new or existing pool.
644292066Sasomers		 * Assume the user knows what he/she is doing and find
645292066Sasomers		 * GEOM provider by its name, ignoring GUID mismatches.
646292066Sasomers		 *
647292066Sasomers		 * XXPOLICY: It would be safer to only allow a device
648292066Sasomers		 *           that is unlabeled or labeled but missing
649292066Sasomers		 *           GUID information to be opened in this fashion,
650292066Sasomers		 *           unless we are doing a split, in which case we
651292066Sasomers		 *           should allow any guid.
652292066Sasomers		 */
653205346Spjd		cp = vdev_geom_open_by_path(vd, 0);
654292066Sasomers	} else {
655292066Sasomers		/*
656292066Sasomers		 * Try using the recorded path for this device, but only
657292066Sasomers		 * accept it if its label data contains the expected GUIDs.
658292066Sasomers		 */
659205346Spjd		cp = vdev_geom_open_by_path(vd, 1);
660205346Spjd		if (cp == NULL) {
661205346Spjd			/*
662205346Spjd			 * The device at vd->vdev_path doesn't have the
663292066Sasomers			 * expected GUIDs. The disks might have merely
664205346Spjd			 * moved around so try all other GEOM providers
665292066Sasomers			 * to find one with the right GUIDs.
666205346Spjd			 */
667292066Sasomers			cp = vdev_geom_open_by_guids(vd);
668205346Spjd		}
669169303Spjd	}
670205346Spjd
671185174Spjd	if (cp == NULL) {
672185174Spjd		ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
673203504Spjd		error = ENOENT;
674218278Sae	} else if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
675218278Sae	    !ISP2(cp->provider->sectorsize)) {
676218278Sae		ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
677218278Sae		    vd->vdev_path);
678218278Sae		vdev_geom_detach(cp, 0);
679218278Sae		error = EINVAL;
680218278Sae		cp = NULL;
681209962Smm	} else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
682207936Spjd		int i;
683207936Spjd
684207936Spjd		for (i = 0; i < 5; i++) {
685207936Spjd			error = g_access(cp, 0, 1, 0);
686207936Spjd			if (error == 0)
687207936Spjd				break;
688207936Spjd			g_topology_unlock();
689207936Spjd			tsleep(vd, 0, "vdev", hz / 2);
690207936Spjd			g_topology_lock();
691207936Spjd		}
692203504Spjd		if (error != 0) {
693207934Spjd			printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
694203504Spjd			    vd->vdev_path, error);
695203504Spjd			vdev_geom_detach(cp, 0);
696203504Spjd			cp = NULL;
697203504Spjd		}
698185174Spjd	}
699219089Spjd	g_topology_unlock();
700219089Spjd	PICKUP_GIANT();
701203504Spjd	if (cp == NULL) {
702203504Spjd		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
703203504Spjd		return (error);
704203504Spjd	}
705185029Spjd
706185029Spjd	cp->private = vd;
707208142Spjd	vd->vdev_tsd = cp;
708169303Spjd	pp = cp->provider;
709168404Spjd
710168404Spjd	/*
711168404Spjd	 * Determine the actual size of the device.
712168404Spjd	 */
713236155Smm	*max_psize = *psize = pp->mediasize;
714168404Spjd
715168404Spjd	/*
716254591Sgibbs	 * Determine the device's minimum transfer size and preferred
717254591Sgibbs	 * transfer size.
718168404Spjd	 */
719254591Sgibbs	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
720254591Sgibbs	*physical_ashift = 0;
721254591Sgibbs	if (pp->stripesize)
722254591Sgibbs		*physical_ashift = highbit(pp->stripesize) - 1;
723168404Spjd
724168404Spjd	/*
725240868Spjd	 * Clear the nowritecache settings, so that on a vdev_reopen()
726240868Spjd	 * we will try again.
727168404Spjd	 */
728168404Spjd	vd->vdev_nowritecache = B_FALSE;
729168404Spjd
730219089Spjd	if (vd->vdev_physpath != NULL)
731219089Spjd		spa_strfree(vd->vdev_physpath);
732219089Spjd	bufsize = sizeof("/dev/") + strlen(pp->name);
733219089Spjd	vd->vdev_physpath = kmem_alloc(bufsize, KM_SLEEP);
734219089Spjd	snprintf(vd->vdev_physpath, bufsize, "/dev/%s", pp->name);
735219089Spjd
736256956Ssmh	/*
737256956Ssmh	 * Determine the device's rotation rate.
738256956Ssmh	 */
739256956Ssmh	vdev_geom_set_rotation_rate(vd, cp);
740256956Ssmh
741168404Spjd	return (0);
742168404Spjd}
743168404Spjd
744168404Spjdstatic void
745168404Spjdvdev_geom_close(vdev_t *vd)
746168404Spjd{
747168404Spjd	struct g_consumer *cp;
748168404Spjd
749208142Spjd	cp = vd->vdev_tsd;
750208142Spjd	if (cp == NULL)
751168404Spjd		return;
752208142Spjd	vd->vdev_tsd = NULL;
753219089Spjd	vd->vdev_delayed_close = B_FALSE;
754253754Smav	cp->private = NULL;	/* XXX locking */
755168404Spjd	g_post_event(vdev_geom_detach, cp, M_WAITOK, NULL);
756168404Spjd}
757168404Spjd
758168404Spjdstatic void
759168404Spjdvdev_geom_io_intr(struct bio *bp)
760168404Spjd{
761219089Spjd	vdev_t *vd;
762168404Spjd	zio_t *zio;
763168404Spjd
764168404Spjd	zio = bp->bio_caller1;
765219089Spjd	vd = zio->io_vd;
766208142Spjd	zio->io_error = bp->bio_error;
767208142Spjd	if (zio->io_error == 0 && bp->bio_resid != 0)
768269407Ssmh		zio->io_error = SET_ERROR(EIO);
769264885Ssmh
770264885Ssmh	switch(zio->io_error) {
771264885Ssmh	case ENOTSUP:
772208142Spjd		/*
773264885Ssmh		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
774264885Ssmh		 * that future attempts will never succeed. In this case
775264885Ssmh		 * we set a persistent flag so that we don't bother with
776264885Ssmh		 * requests in the future.
777208142Spjd		 */
778264885Ssmh		switch(bp->bio_cmd) {
779264885Ssmh		case BIO_FLUSH:
780264885Ssmh			vd->vdev_nowritecache = B_TRUE;
781264885Ssmh			break;
782264885Ssmh		case BIO_DELETE:
783264885Ssmh			vd->vdev_notrim = B_TRUE;
784264885Ssmh			break;
785219089Spjd		}
786264885Ssmh		break;
787264885Ssmh	case ENXIO:
788264885Ssmh		if (!vd->vdev_remove_wanted) {
789264885Ssmh			/*
790264885Ssmh			 * If provider's error is set we assume it is being
791264885Ssmh			 * removed.
792264885Ssmh			 */
793264885Ssmh			if (bp->bio_to->error != 0) {
794264885Ssmh				vd->vdev_remove_wanted = B_TRUE;
795264885Ssmh				spa_async_request(zio->io_spa,
796264885Ssmh				    SPA_ASYNC_REMOVE);
797264885Ssmh			} else if (!vd->vdev_delayed_close) {
798264885Ssmh				vd->vdev_delayed_close = B_TRUE;
799264885Ssmh			}
800264885Ssmh		}
801264885Ssmh		break;
802219089Spjd	}
803208142Spjd	g_destroy_bio(bp);
804208142Spjd	zio_interrupt(zio);
805168404Spjd}
806168404Spjd
807274304Sdelphijstatic void
808168404Spjdvdev_geom_io_start(zio_t *zio)
809168404Spjd{
810168404Spjd	vdev_t *vd;
811168404Spjd	struct g_consumer *cp;
812168404Spjd	struct bio *bp;
813168404Spjd	int error;
814168404Spjd
815168404Spjd	vd = zio->io_vd;
816168404Spjd
817265152Ssmh	switch (zio->io_type) {
818265152Ssmh	case ZIO_TYPE_IOCTL:
819168404Spjd		/* XXPOLICY */
820185029Spjd		if (!vdev_readable(vd)) {
821265152Ssmh			zio->io_error = SET_ERROR(ENXIO);
822274304Sdelphij			zio_interrupt(zio);
823274304Sdelphij			return;
824269407Ssmh		} else {
825269407Ssmh			switch (zio->io_cmd) {
826269407Ssmh			case DKIOCFLUSHWRITECACHE:
827269407Ssmh				if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
828269407Ssmh					break;
829269407Ssmh				if (vd->vdev_nowritecache) {
830269407Ssmh					zio->io_error = SET_ERROR(ENOTSUP);
831269407Ssmh					break;
832269407Ssmh				}
833269407Ssmh				goto sendreq;
834269407Ssmh			default:
835265152Ssmh				zio->io_error = SET_ERROR(ENOTSUP);
836168404Spjd			}
837168404Spjd		}
838168404Spjd
839274304Sdelphij		zio_execute(zio);
840274304Sdelphij		return;
841265152Ssmh	case ZIO_TYPE_FREE:
842265152Ssmh		if (vd->vdev_notrim) {
843265152Ssmh			zio->io_error = SET_ERROR(ENOTSUP);
844269407Ssmh		} else if (!vdev_geom_bio_delete_disable) {
845269407Ssmh			goto sendreq;
846265152Ssmh		}
847274304Sdelphij		zio_execute(zio);
848274304Sdelphij		return;
849168404Spjd	}
850168404Spjdsendreq:
851274619Ssmh	ASSERT(zio->io_type == ZIO_TYPE_READ ||
852274619Ssmh	    zio->io_type == ZIO_TYPE_WRITE ||
853274619Ssmh	    zio->io_type == ZIO_TYPE_FREE ||
854274619Ssmh	    zio->io_type == ZIO_TYPE_IOCTL);
855274619Ssmh
856208142Spjd	cp = vd->vdev_tsd;
857185029Spjd	if (cp == NULL) {
858265152Ssmh		zio->io_error = SET_ERROR(ENXIO);
859269407Ssmh		zio_interrupt(zio);
860274304Sdelphij		return;
861168404Spjd	}
862168404Spjd	bp = g_alloc_bio();
863168404Spjd	bp->bio_caller1 = zio;
864168404Spjd	switch (zio->io_type) {
865168404Spjd	case ZIO_TYPE_READ:
866168404Spjd	case ZIO_TYPE_WRITE:
867168404Spjd		bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
868168404Spjd		bp->bio_data = zio->io_data;
869168404Spjd		bp->bio_offset = zio->io_offset;
870168404Spjd		bp->bio_length = zio->io_size;
871168404Spjd		break;
872265152Ssmh	case ZIO_TYPE_FREE:
873265152Ssmh		bp->bio_cmd = BIO_DELETE;
874265152Ssmh		bp->bio_data = NULL;
875265152Ssmh		bp->bio_offset = zio->io_offset;
876265152Ssmh		bp->bio_length = zio->io_size;
877265152Ssmh		break;
878168404Spjd	case ZIO_TYPE_IOCTL:
879269407Ssmh		bp->bio_cmd = BIO_FLUSH;
880269407Ssmh		bp->bio_flags |= BIO_ORDERED;
881269407Ssmh		bp->bio_data = NULL;
882269407Ssmh		bp->bio_offset = cp->provider->mediasize;
883269407Ssmh		bp->bio_length = 0;
884168404Spjd		break;
885168404Spjd	}
886168404Spjd	bp->bio_done = vdev_geom_io_intr;
887168404Spjd
888168404Spjd	g_io_request(bp, cp);
889168404Spjd}
890168404Spjd
/*
 * vdev "io_done" operation.  Intentionally empty: this backend has no
 * work to do at this stage.
 */
static void
vdev_geom_io_done(zio_t *zio)
{
}
895168404Spjd
/*
 * vdev "hold" operation.  Intentionally empty for GEOM-backed vdevs.
 */
static void
vdev_geom_hold(vdev_t *vd)
{
}
900219089Spjd
/*
 * vdev "rele" operation.  Intentionally empty for GEOM-backed vdevs.
 */
static void
vdev_geom_rele(vdev_t *vd)
{
}
905219089Spjd
/*
 * Operations vector exported for GEOM-backed disk vdevs.  The initializer
 * is positional, so the order must match the vdev_ops_t declaration.
 */
vdev_ops_t vdev_geom_ops = {
	vdev_geom_open,
	vdev_geom_close,
	vdev_default_asize,
	vdev_geom_io_start,
	vdev_geom_io_done,
	NULL,			/* no handler; presumably state change -- verify against vdev_ops_t */
	vdev_geom_hold,
	vdev_geom_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};
917168404Spjd};
918