vdev_geom.c revision 185174
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/disk.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <geom/geom.h>
#include <geom/geom_int.h>

/*
 * Virtual device vector for GEOM.
 */

struct g_class zfs_vdev_class = {
	.name = "ZFS::VDEV",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

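/*
 * Per-vdev context: the GEOM consumer used for I/O, a completion queue
 * fed by vdev_geom_io_intr() and drained by the worker thread, and the
 * state/mutex pair used to shut the worker down.
 */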
typedef struct vdev_geom_ctx {
	struct g_consumer *gc_consumer;
	int gc_state;
	struct bio_queue_head gc_queue;
	struct mtx gc_queue_mtx;
} vdev_geom_ctx_t;

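/*
 * Detach the context from the vdev and tear it down: ask the worker
 * thread to exit (gc_state = 1), wait until it acknowledges
 * (gc_state = 2) and free the context.
 */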
static void
vdev_geom_release(vdev_t *vd)
{
	vdev_geom_ctx_t *ctx;

	ctx = vd->vdev_tsd;
	vd->vdev_tsd = NULL;

	mtx_lock(&ctx->gc_queue_mtx);
	ctx->gc_state = 1;
	wakeup_one(&ctx->gc_queue);
	while (ctx->gc_state != 2)
		msleep(&ctx->gc_state, &ctx->gc_queue_mtx, 0, "vgeom:w", 0);
	mtx_unlock(&ctx->gc_queue_mtx);
	mtx_destroy(&ctx->gc_queue_mtx);
	kmem_free(ctx, sizeof(*ctx));
}

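/*
 * Called by GEOM when the underlying provider goes away.  Drop our
 * access counts, destroy the consumer (and the geom, if it was the last
 * consumer) and ask the SPA to remove the vdev asynchronously.
 */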
static void
vdev_geom_orphan(struct g_consumer *cp)
{
	struct g_geom *gp;
	vdev_t *vd;
	int error;

	g_topology_assert();

	vd = cp->private;
	gp = cp->geom;
	error = cp->provider->error;

	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
	if (cp->acr + cp->acw + cp->ace > 0)
		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
	ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name);
	g_detach(cp);
	g_destroy_consumer(cp);
	/* Destroy geom if there are no consumers left. */
	if (LIST_EMPTY(&gp->consumer)) {
		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
		g_wither_geom(gp, error);
	}
	vdev_geom_release(vd);

	vd->vdev_remove_wanted = B_TRUE;
	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
}

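/*
 * Attach a consumer to the given provider, reusing the "zfs::vdev" geom
 * and an existing consumer where possible.  Returns NULL if the
 * provider cannot be attached or opened with the requested access.
 */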
static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, int write)
{
	struct g_geom *gp;
	struct g_consumer *cp;

	g_topology_assert();

	ZFS_LOG(1, "Attaching to %s.", pp->name);
	/* Do we have geom already? No? Create one. */
	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
		if (gp->flags & G_GEOM_WITHER)
			continue;
		if (strcmp(gp->name, "zfs::vdev") != 0)
			continue;
		break;
	}
	if (gp == NULL) {
		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
		gp->orphan = vdev_geom_orphan;
		cp = g_new_consumer(gp);
		if (g_attach(cp, pp) != 0) {
			g_wither_geom(gp, ENXIO);
			return (NULL);
		}
		if (g_access(cp, 1, write, 1) != 0) {
			g_wither_geom(gp, ENXIO);
			return (NULL);
		}
		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
	} else {
		/* Check if we are already connected to this provider. */
		LIST_FOREACH(cp, &gp->consumer, consumer) {
			if (cp->provider == pp) {
				ZFS_LOG(1, "Found consumer for %s.", pp->name);
				break;
			}
		}
		if (cp == NULL) {
			cp = g_new_consumer(gp);
			if (g_attach(cp, pp) != 0) {
				g_destroy_consumer(cp);
				return (NULL);
			}
			if (g_access(cp, 1, write, 1) != 0) {
				g_detach(cp);
				g_destroy_consumer(cp);
				return (NULL);
			}
			ZFS_LOG(1, "Created consumer for %s.", pp->name);
		} else {
			if (g_access(cp, 1, cp->acw > 0 ? 0 : write, 1) != 0)
				return (NULL);
			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
		}
	}
	return (cp);
}

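/*
 * Event handler: drop one read/exclusive reference and, on last close,
 * destroy the consumer (and the geom, if no consumers remain).
 */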
static void
vdev_geom_detach(void *arg, int flag __unused)
{
	struct g_geom *gp;
	struct g_consumer *cp;

	g_topology_assert();
	cp = arg;
	gp = cp->geom;

	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
	g_access(cp, -1, 0, -1);
	/* Destroy consumer on last close. */
	if (cp->acr == 0 && cp->ace == 0) {
		ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name);
		if (cp->acw > 0)
			g_access(cp, 0, -cp->acw, 0);
		g_detach(cp);
		g_destroy_consumer(cp);
	}
	/* Destroy geom if there are no consumers left. */
	if (LIST_EMPTY(&gp->consumer)) {
		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
		g_wither_geom(gp, ENXIO);
	}
}

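/*
 * Worker thread: takes completed bios off the context queue, copies the
 * bio status into the zio and finishes the zio via zio_interrupt().
 * Exits when vdev_geom_release() sets gc_state to 1.
 */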
static void
vdev_geom_worker(void *arg)
{
	vdev_geom_ctx_t *ctx;
	zio_t *zio;
	struct bio *bp;

	ctx = arg;
	for (;;) {
		mtx_lock(&ctx->gc_queue_mtx);
		bp = bioq_takefirst(&ctx->gc_queue);
		if (bp == NULL) {
			if (ctx->gc_state == 1) {
				ctx->gc_state = 2;
				wakeup_one(&ctx->gc_state);
				mtx_unlock(&ctx->gc_queue_mtx);
				kproc_exit(0);
			}
			msleep(&ctx->gc_queue, &ctx->gc_queue_mtx,
			    PRIBIO | PDROP, "vgeom:io", 0);
			continue;
		}
		mtx_unlock(&ctx->gc_queue_mtx);
		zio = bp->bio_caller1;
		zio->io_error = bp->bio_error;
		if (bp->bio_cmd == BIO_FLUSH && bp->bio_error == ENOTSUP) {
			vdev_t *vd;

			/*
			 * If we get ENOTSUP, we know that no future
			 * attempts will ever succeed.  In this case we
			 * set a persistent bit so that we don't bother
			 * with the ioctl in the future.
			 */
			vd = zio->io_vd;
			vd->vdev_nowritecache = B_TRUE;
		}
		g_destroy_bio(bp);
		zio_interrupt(zio);
	}
}

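/*
 * Extract the top-level "guid" value from an unpacked label nvlist;
 * returns 0 if no such pair exists.
 */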
static uint64_t
nvlist_get_guid(nvlist_t *list)
{
	nvpair_t *elem = NULL;
	uint64_t value;

	while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
		if (nvpair_type(elem) == DATA_TYPE_UINT64 &&
		    strcmp(nvpair_name(elem), "guid") == 0) {
			VERIFY(nvpair_value_uint64(elem, &value) == 0);
			return (value);
		}
	}
	return (0);
}

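/*
 * Issue a synchronous transfer: split the request into MAXPHYS-sized
 * bios and wait for each one to complete before issuing the next.
 */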
static int
vdev_geom_io(struct g_consumer *cp, int cmd, void *data, off_t offset, off_t size)
{
	struct bio *bp;
	u_char *p;
	off_t off;
	int error;

	ASSERT((offset % cp->provider->sectorsize) == 0);
	ASSERT((size % cp->provider->sectorsize) == 0);

	bp = g_alloc_bio();
	off = offset;
	offset += size;
	p = data;
	error = 0;

	for (; off < offset; off += MAXPHYS, p += MAXPHYS, size -= MAXPHYS) {
		bzero(bp, sizeof(*bp));
		bp->bio_cmd = cmd;
		bp->bio_done = NULL;
		bp->bio_offset = off;
		bp->bio_length = MIN(size, MAXPHYS);
		bp->bio_data = p;
		g_io_request(bp, cp);
		error = biowait(bp, "vdev_geom_io");
		if (error != 0)
			break;
	}

	g_destroy_bio(bp);
	return (error);
}

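/*
 * Read the vdev guid from the ZFS labels on the provider.  Tries each
 * label copy in turn until one unpacks and yields a non-zero guid.
 */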
static uint64_t
vdev_geom_read_guid(struct g_consumer *cp)
{
	struct g_provider *pp;
	vdev_label_t *label;
	char *p, *buf;
	size_t buflen;
	uint64_t psize;
	off_t offset, size;
	uint64_t guid;
	int error, l, len;

	g_topology_assert_not();

	pp = cp->provider;

	psize = pp->mediasize;
	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));

	size = sizeof(*label) + pp->sectorsize -
	    ((sizeof(*label) - 1) % pp->sectorsize) - 1;

	guid = 0;
	label = kmem_alloc(size, KM_SLEEP);
	buflen = sizeof(label->vl_vdev_phys.vp_nvlist);

	for (l = 0; l < VDEV_LABELS; l++) {
		nvlist_t *config = NULL;

		offset = vdev_label_offset(psize, l, 0);
		if ((offset % pp->sectorsize) != 0)
			continue;

		error = vdev_geom_io(cp, BIO_READ, label, offset, size);
		if (error != 0)
			continue;
		buf = label->vl_vdev_phys.vp_nvlist;

		if (nvlist_unpack(buf, buflen, &config, 0) != 0)
			continue;

		guid = nvlist_get_guid(config);
		nvlist_free(config);
		if (guid != 0)
			break;
	}

	kmem_free(label, size);
	if (guid != 0)
		ZFS_LOG(1, "guid for %s is %ju", pp->name, (uintmax_t)guid);
	return (guid);
}

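/* Argument bundle passed to the taste-by-guid event handler below. */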
struct vdev_geom_find {
	uint64_t guid;
	int write;
	struct g_consumer *cp;
};

static void
vdev_geom_taste_orphan(struct g_consumer *cp)
{

	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
	    cp->provider->name));
}

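/*
 * Executed from the GEOM event queue: taste every eligible provider in
 * the system and attach to the first one whose label carries the
 * requested guid.
 */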
static void
vdev_geom_attach_by_guid_event(void *arg, int flags __unused)
{
	struct vdev_geom_find *ap;
	struct g_class *mp;
	struct g_geom *gp, *zgp;
	struct g_provider *pp;
	struct g_consumer *zcp;
	uint64_t guid;

	g_topology_assert();

	ap = arg;

	zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
	/* This orphan function should never be called. */
	zgp->orphan = vdev_geom_taste_orphan;
	zcp = g_new_consumer(zgp);

	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (pp->flags & G_PF_WITHER)
					continue;
				g_attach(zcp, pp);
				if (g_access(zcp, 1, 0, 0) != 0) {
					g_detach(zcp);
					continue;
				}
				g_topology_unlock();
				guid = vdev_geom_read_guid(zcp);
				g_topology_lock();
				g_access(zcp, -1, 0, 0);
				g_detach(zcp);
				if (guid != ap->guid)
					continue;
				ap->cp = vdev_geom_attach(pp, ap->write);
				if (ap->cp == NULL) {
					printf("ZFS WARNING: Cannot open %s "
					    "for writing.\n", pp->name);
					continue;
				}
				goto end;
			}
		}
	}
	ap->cp = NULL;
end:
	g_destroy_consumer(zcp);
	g_destroy_geom(zgp);
}

static struct g_consumer *
vdev_geom_attach_by_guid(uint64_t guid, int write)
{
	struct vdev_geom_find *ap;
	struct g_consumer *cp;

	ap = kmem_zalloc(sizeof(*ap), KM_SLEEP);
	ap->guid = guid;
	ap->write = write;
	g_waitfor_event(vdev_geom_attach_by_guid_event, ap, M_WAITOK, NULL);
	cp = ap->cp;
	kmem_free(ap, sizeof(*ap));
	return (cp);
}

static struct g_consumer *
vdev_geom_open_by_guid(vdev_t *vd)
{
	struct g_consumer *cp;
	char *buf;
	size_t len;

	ZFS_LOG(1, "Searching by guid [%ju].", (uintmax_t)vd->vdev_guid);
	cp = vdev_geom_attach_by_guid(vd->vdev_guid, !!(spa_mode & FWRITE));
	if (cp != NULL) {
		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
		buf = kmem_alloc(len, KM_SLEEP);

		snprintf(buf, len, "/dev/%s", cp->provider->name);
		spa_strfree(vd->vdev_path);
		vd->vdev_path = buf;

		ZFS_LOG(1, "Attach by guid [%ju] succeeded, provider %s.",
		    (uintmax_t)vd->vdev_guid, vd->vdev_path);
	} else {
		ZFS_LOG(1, "Search by guid [%ju] failed.",
		    (uintmax_t)vd->vdev_guid);
	}

	return (cp);
}

static struct g_consumer *
vdev_geom_open_by_path(vdev_t *vd, int check_guid)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	uint64_t guid;

	cp = NULL;
	g_topology_lock();
	pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
	if (pp != NULL) {
		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
		cp = vdev_geom_attach(pp, !!(spa_mode & FWRITE));
		if (cp != NULL && check_guid) {
			g_topology_unlock();
			guid = vdev_geom_read_guid(cp);
			g_topology_lock();
			if (guid != vd->vdev_guid) {
				vdev_geom_detach(cp, 0);
				cp = NULL;
				ZFS_LOG(1, "guid mismatch for provider %s: "
				    "%ju != %ju.", vd->vdev_path,
				    (uintmax_t)vd->vdev_guid, (uintmax_t)guid);
			} else {
				ZFS_LOG(1, "guid match for provider %s.",
				    vd->vdev_path);
			}
		}
	}
	g_topology_unlock();

	return (cp);
}

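/*
 * Open the vdev: find a matching provider (by path first, then by
 * guid), set up the context and worker thread, and report the media
 * size and minimum transfer size (ashift) back to ZFS.
 */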
static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
{
	vdev_geom_ctx_t *ctx;
	struct g_provider *pp;
	struct g_consumer *cp;
	int owned;

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	vd->vdev_tsd = NULL;

	if ((owned = mtx_owned(&Giant)))
		mtx_unlock(&Giant);
	cp = vdev_geom_open_by_path(vd, 0);
	if (cp == NULL) {
		/*
		 * The provider at vd->vdev_path could not be opened.  The
		 * disks might have merely moved around, so try all other
		 * GEOM providers to find one with the right guid.
		 */
		cp = vdev_geom_open_by_guid(vd);
	}
	if (cp == NULL)
		cp = vdev_geom_open_by_path(vd, 1);
	if (cp == NULL) {
		ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		if (owned)
			mtx_lock(&Giant);
		return (EACCES);
	}
	if (owned)
		mtx_lock(&Giant);

	cp->private = vd;

	ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP);
	bioq_init(&ctx->gc_queue);
	mtx_init(&ctx->gc_queue_mtx, "zfs:vdev:geom:queue", NULL, MTX_DEF);
	ctx->gc_consumer = cp;
	ctx->gc_state = 0;

	vd->vdev_tsd = ctx;
	pp = cp->provider;

	kproc_create(vdev_geom_worker, ctx, NULL, 0, 0, "vdev:worker %s",
	    pp->name);

	/*
	 * Determine the actual size of the device.
	 */
	*psize = pp->mediasize;

	/*
	 * Determine the device's minimum transfer size.
	 */
	*ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;

	/*
	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
	 * try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	return (0);
}

static void
vdev_geom_close(vdev_t *vd)
{
	vdev_geom_ctx_t *ctx;
	struct g_consumer *cp;

	if ((ctx = vd->vdev_tsd) == NULL)
		return;
	if ((cp = ctx->gc_consumer) == NULL)
		return;
	vdev_geom_release(vd);
	g_post_event(vdev_geom_detach, cp, M_WAITOK, NULL);
}

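/*
 * bio_done callback: record any error (treating a short transfer as
 * EIO) and queue the bio for the worker thread, which completes the
 * zio.
 */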
static void
vdev_geom_io_intr(struct bio *bp)
{
	vdev_geom_ctx_t *ctx;
	zio_t *zio;

	zio = bp->bio_caller1;
	ctx = zio->io_vd->vdev_tsd;

	if ((zio->io_error = bp->bio_error) == 0 && bp->bio_resid != 0)
		zio->io_error = EIO;

	mtx_lock(&ctx->gc_queue_mtx);
	bioq_insert_tail(&ctx->gc_queue, bp);
	wakeup_one(&ctx->gc_queue);
	mtx_unlock(&ctx->gc_queue_mtx);
}

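/*
 * Translate a zio into a bio and hand it to GEOM.  Cache flushes are
 * issued as BIO_FLUSH; other ioctls are rejected with ENOTSUP.
 */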
static int
vdev_geom_io_start(zio_t *zio)
{
	vdev_t *vd;
	vdev_geom_ctx_t *ctx;
	struct g_consumer *cp;
	struct bio *bp;
	int error;

	cp = NULL;

	vd = zio->io_vd;
	ctx = vd->vdev_tsd;
	if (ctx != NULL)
		cp = ctx->gc_consumer;

	if (zio->io_type == ZIO_TYPE_IOCTL) {
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = ENXIO;
			return (ZIO_PIPELINE_CONTINUE);
		}

		switch (zio->io_cmd) {

		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (vd->vdev_nowritecache) {
				zio->io_error = ENOTSUP;
				break;
			}

			goto sendreq;
		default:
			zio->io_error = ENOTSUP;
		}

		return (ZIO_PIPELINE_CONTINUE);
	}
sendreq:
	if (cp == NULL) {
		zio->io_error = ENXIO;
		return (ZIO_PIPELINE_CONTINUE);
	}
	bp = g_alloc_bio();
	bp->bio_caller1 = zio;
	switch (zio->io_type) {
	case ZIO_TYPE_READ:
	case ZIO_TYPE_WRITE:
		bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
		bp->bio_data = zio->io_data;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_IOCTL:
		bp->bio_cmd = BIO_FLUSH;
		bp->bio_data = NULL;
		bp->bio_offset = cp->provider->mediasize;
		bp->bio_length = 0;
		break;
	}
	bp->bio_done = vdev_geom_io_intr;

	g_io_request(bp, cp);

	return (ZIO_PIPELINE_STOP);
}

static void
vdev_geom_io_done(zio_t *zio)
{

	/*
	 * If the device returned ENXIO, we should verify whether the GEOM
	 * provider has been removed.  If that is the case, we trigger an
	 * asynchronous removal of the device.
	 */
	if (zio->io_error == ENXIO) {
		vdev_t *vd = zio->io_vd;
		vdev_geom_ctx_t *ctx;
		struct g_provider *pp = NULL;

		ctx = vd->vdev_tsd;
		if (ctx != NULL && ctx->gc_consumer != NULL)
			pp = ctx->gc_consumer->provider;

		if (pp == NULL || (pp->flags & G_PF_ORPHAN)) {
			vd->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

vdev_ops_t vdev_geom_ops = {
	vdev_geom_open,
	vdev_geom_close,
	vdev_default_asize,
	vdev_geom_io_start,
	vdev_geom_io_done,
	NULL,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};