devfs_vfsops.c revision 7862:f8b6a07acfd6
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/*
27 * This is the device filesystem.
28 *
29 * It is a combination of a namer to drive autoconfiguration,
30 * plus the access methods for the device drivers of the system.
31 *
32 * The prototype is fairly dependent on specfs for the latter part
33 * of its implementation, though a final version would integrate the two.
34 */
35#include <sys/types.h>
36#include <sys/param.h>
37#include <sys/sysmacros.h>
38#include <sys/systm.h>
39#include <sys/kmem.h>
40#include <sys/time.h>
41#include <sys/pathname.h>
42#include <sys/vfs.h>
43#include <sys/vfs_opreg.h>
44#include <sys/vnode.h>
45#include <sys/stat.h>
46#include <sys/uio.h>
47#include <sys/stat.h>
48#include <sys/errno.h>
49#include <sys/cmn_err.h>
50#include <sys/cred.h>
51#include <sys/statvfs.h>
52#include <sys/mount.h>
53#include <sys/debug.h>
54#include <sys/modctl.h>
55#include <fs/fs_subr.h>
56#include <sys/fs/dv_node.h>
57#include <sys/fs/snode.h>
58#include <sys/sunndi.h>
59#include <sys/policy.h>
60#include <sys/sunmdi.h>
61
/*
 * Forward declarations of the VFS entry points that devfsinit()
 * installs in the devfs operations vector.
 */
static int devfs_mount(struct vfs *vfsp, struct vnode *mvp,
    struct mounta *uap, struct cred *cr);
static int devfs_unmount(struct vfs *vfsp, int flag, struct cred *cr);
static int devfs_root(struct vfs *vfsp, struct vnode **vpp);
static int devfs_statvfs(struct vfs *vfsp, struct statvfs64 *sbp);
static int devfs_mountroot(struct vfs *vfsp, enum whymountroot why);

/* Per-fstype initialization routine referenced by devfs_vfssw. */
static int devfsinit(int fstype, char *name);
74static vfsdef_t devfs_vfssw = {
75	VFSDEF_VERSION,
76	"devfs",	/* type name string */
77	devfsinit,	/* init routine */
78	0,		/* flags */
79	NULL		/* mount options table prototype */
80};
81
82static kmutex_t devfs_lock;	/* protects global data */
83static int devfstype;		/* fstype */
84static dev_t devfsdev;		/* the fictious 'device' we live on */
85static struct devfs_data *devfs_mntinfo;	/* linked list of instances */
86
87/*
88 * Module linkage information
89 */
90static struct modlfs modlfs = {
91	&mod_fsops, "devices filesystem", &devfs_vfssw
92};
93
94static struct modlinkage modlinkage = {
95	MODREV_1, (void *)&modlfs, NULL
96};
97
98int
99_init(void)
100{
101	int e;
102
103	mutex_init(&devfs_lock, "devfs lock", MUTEX_DEFAULT, NULL);
104	dv_node_cache_init();
105	if ((e = mod_install(&modlinkage)) != 0) {
106		dv_node_cache_fini();
107		mutex_destroy(&devfs_lock);
108		return (e);
109	}
110	dcmn_err(("devfs loaded\n"));
111	return (0);
112}
113
114int
115_fini(void)
116{
117	return (EBUSY);
118}
119
120int
121_info(struct modinfo *modinfop)
122{
123	return (mod_info(&modlinkage, modinfop));
124}
125
126/*ARGSUSED1*/
127static int
128devfsinit(int fstype, char *name)
129{
130	static const fs_operation_def_t devfs_vfsops_template[] = {
131		VFSNAME_MOUNT,		{ .vfs_mount = devfs_mount },
132		VFSNAME_UNMOUNT,	{ .vfs_unmount = devfs_unmount },
133		VFSNAME_ROOT,		{ .vfs_root = devfs_root },
134		VFSNAME_STATVFS,	{ .vfs_statvfs = devfs_statvfs },
135		VFSNAME_SYNC,		{ .vfs_sync = fs_sync },
136		VFSNAME_MOUNTROOT,	{ .vfs_mountroot = devfs_mountroot },
137		NULL,			NULL
138	};
139	int error;
140	int dev;
141	extern major_t getudev(void);	/* gack - what a function */
142
143	devfstype = fstype;
144	/*
145	 * Associate VFS ops vector with this fstype
146	 */
147	error = vfs_setfsops(fstype, devfs_vfsops_template, NULL);
148	if (error != 0) {
149		cmn_err(CE_WARN, "devfsinit: bad vfs ops template");
150		return (error);
151	}
152
153	error = vn_make_ops("dev fs", dv_vnodeops_template, &dv_vnodeops);
154	if (error != 0) {
155		(void) vfs_freevfsops_by_type(fstype);
156		cmn_err(CE_WARN, "devfsinit: bad vnode ops template");
157		return (error);
158	}
159
160	/*
161	 * Invent a dev_t (sigh).
162	 */
163	if ((dev = getudev()) == DDI_MAJOR_T_NONE) {
164		cmn_err(CE_NOTE, "%s: can't get unique dev", devfs_vfssw.name);
165		dev = 0;
166	}
167	devfsdev = makedevice(dev, 0);
168
169	return (0);
170}
171
172/*
173 * The name of the mount point and the name of the attribute
174 * filesystem are passed down from userland for now.
175 */
176static int
177devfs_mount(struct vfs *vfsp, struct vnode *mvp, struct mounta *uap,
178    struct cred *cr)
179{
180	struct devfs_data *devfs_data;
181	struct vnode *avp;
182	struct dv_node *dv;
183	struct vattr va;
184
185	dcmn_err(("devfs_mount\n"));
186
187	if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
188		return (EPERM);
189
190	/*
191	 * check that the mount point is sane
192	 */
193	if (mvp->v_type != VDIR)
194		return (ENOTDIR);
195
196	ASSERT(uap->flags & MS_SYSSPACE);
197	/*
198	 * Devfs can only be mounted from kernel during boot.
199	 * avp is the existing /devices, the same as the mount point.
200	 */
201	avp = mvp;
202
203	/*
204	 * Create and initialize the vfs-private data.
205	 * This includes a hand-crafted root vnode (we build
206	 * this here mostly so that traverse() doesn't sleep
207	 * in VFS_ROOT()).
208	 */
209	mutex_enter(&devfs_lock);
210	ASSERT(devfs_mntinfo == NULL);
211	dv = dv_mkroot(vfsp, devfsdev);
212	dv->dv_attrvp = avp;		/* attribute root vp */
213
214	ASSERT(dv == dv->dv_dotdot);
215
216	devfs_data = kmem_zalloc(sizeof (struct devfs_data), KM_SLEEP);
217	devfs_data->devfs_vfsp = vfsp;
218	devfs_data->devfs_root = dv;
219
220	vfsp->vfs_data = (caddr_t)devfs_data;
221	vfsp->vfs_fstype = devfstype;
222	vfsp->vfs_dev = devfsdev;
223	vfsp->vfs_bsize = DEV_BSIZE;
224	vfsp->vfs_mtime = ddi_get_time();
225	vfs_make_fsid(&vfsp->vfs_fsid, vfsp->vfs_dev, devfstype);
226
227	/* We're there. */
228	devfs_mntinfo = devfs_data;
229	mutex_exit(&devfs_lock);
230
231	va.va_mask = AT_ATIME|AT_MTIME;
232	gethrestime(&va.va_atime);
233	gethrestime(&va.va_mtime);
234	(void) VOP_SETATTR(DVTOV(dv), &va, 0, cr, NULL);
235	return (0);
236}
237
238
239/*
240 * We never unmount devfs in a real production system.
241 */
242/*ARGSUSED*/
243static int
244devfs_unmount(struct vfs *vfsp, int flag, struct cred *cr)
245{
246	return (EBUSY);
247}
248
249/*
250 * return root vnode for given vfs
251 */
252static int
253devfs_root(struct vfs *vfsp, struct vnode **vpp)
254{
255	dcmn_err(("devfs_root\n"));
256	*vpp = DVTOV(VFSTODVFS(vfsp)->devfs_root);
257	VN_HOLD(*vpp);
258	return (0);
259}
260
261/*
262 * return 'generic superblock' information to userland.
263 *
264 * not much that we can usefully admit to here
265 */
266static int
267devfs_statvfs(struct vfs *vfsp, struct statvfs64 *sbp)
268{
269	extern kmem_cache_t *dv_node_cache;
270
271	dev32_t d32;
272
273	dcmn_err(("devfs_statvfs\n"));
274	bzero(sbp, sizeof (*sbp));
275	sbp->f_frsize = sbp->f_bsize = vfsp->vfs_bsize;
276	/*
277	 * We could compute the number of devfsnodes here .. but since
278	 * it's dynamic anyway, it's not clear how useful this is.
279	 */
280	sbp->f_files = kmem_cache_stat(dv_node_cache, "alloc");
281
282	/* no illusions that free/avail files is relevant to devfs */
283	sbp->f_ffree = 0;
284	sbp->f_favail = 0;
285
286	/* no illusions that blocks are relevant to devfs */
287	sbp->f_bfree = 0;
288	sbp->f_bavail = 0;
289	sbp->f_blocks = 0;
290
291	(void) cmpldev(&d32, vfsp->vfs_dev);
292	sbp->f_fsid = d32;
293	(void) strcpy(sbp->f_basetype, vfssw[devfstype].vsw_name);
294	sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
295	sbp->f_namemax = MAXNAMELEN - 1;
296	(void) strcpy(sbp->f_fstr, "devices");
297
298	return (0);
299}
300
301/*
302 * devfs always mount after root is mounted, so this should never
303 * be invoked.
304 */
305/*ARGSUSED*/
306static int
307devfs_mountroot(struct vfs *vfsp, enum whymountroot why)
308{
309	dcmn_err(("devfs_mountroot\n"));
310
311	return (EINVAL);
312}
313
314struct dv_node *
315devfs_dip_to_dvnode(dev_info_t *dip)
316{
317	char *dirpath;
318	struct vnode *dirvp;
319
320	ASSERT(dip != NULL);
321
322	/* no-op if devfs not mounted yet */
323	if (devfs_mntinfo == NULL)
324		return (NULL);
325
326	/*
327	 * The lookupname below only looks up cached dv_nodes
328	 * because devfs_clean_key is set in thread specific data.
329	 */
330	dirpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
331	(void) ddi_pathname(dip, dirpath);
332	if (devfs_lookupname(dirpath, NULLVPP, &dirvp)) {
333		dcmn_err(("directory %s not found\n", dirpath));
334		kmem_free(dirpath, MAXPATHLEN);
335		return (NULL);
336	}
337
338	kmem_free(dirpath, MAXPATHLEN);
339	return (VTODV(dirvp));
340}
341
342/*
343 * If DV_CLEAN_FORCE devfs_clean is issued with a dip that is not the root
344 * and not a vHCI we also need to clean any vHCI branches because they
345 * may contain pHCI nodes. A detach_node() of a pHCI will fail if its
346 * mdi_devi_offline() fails, and the mdi_devi_offline() of the last
347 * pHCI will fail unless an ndi_devi_offline() of the Client nodes under
348 * the vHCI is successful - which requires a clean vHCI branch to removed
349 * the devi_refs associated with devfs vnodes.
350 */
351static int
352devfs_clean_vhci(dev_info_t *dip, void *args)
353{
354	struct dv_node	*dvp;
355	uint_t		flags = (uint_t)(uintptr_t)args;
356
357	(void) tsd_set(devfs_clean_key, (void *)1);
358	dvp = devfs_dip_to_dvnode(dip);
359	if (dvp) {
360		(void) dv_cleandir(dvp, NULL, flags);
361		VN_RELE(DVTOV(dvp));
362	}
363	(void) tsd_set(devfs_clean_key, NULL);
364	return (DDI_WALK_CONTINUE);
365}
366
367/*
368 * devfs_clean()
369 *
370 * Destroy unreferenced dv_node's and detach devices.
371 *
372 * devfs_clean will try its best to clean up unused nodes. It is
373 * no longer valid to assume that just because devfs_clean fails,
374 * the device is not removable. This is because device contracts
375 * can result in userland processes releasing a device during the
376 * device offline process in the kernel. Thus it is no longer
377 * correct to fail an offline just because devfs_clean finds
378 * referenced dv_nodes. To enforce this, devfs_clean() always
379 * returns success i.e. 0.
380 *
381 * devfs_clean() may return before removing all possible nodes if
382 * we cannot acquire locks in areas of the code where potential for
383 * deadlock exists (see comments in dv_find() and dv_cleandir() for
384 * examples of this).
385 *
386 * devfs caches unreferenced dv_node to speed by the performance
387 * of ls, find, etc. devfs_clean() is invoked to cleanup cached
388 * dv_nodes to reclaim memory as well as to facilitate device
389 * removal (dv_node reference devinfo nodes, which prevents driver
390 * detach).
391 *
392 * If a shell parks in a /devices directory, the dv_node will be
393 * held, preventing the corresponding device to be detached.
394 * This would be a denial of service against DR. To prevent this,
395 * DR code calls devfs_clean() with the DV_CLEAN_FORCE flag.
396 * The dv_cleandir() implementation does the right thing to ensure
397 * successful DR.
398 */
399int
400devfs_clean(dev_info_t *dip, char *devnm, uint_t flags)
401{
402	struct dv_node		*dvp;
403
404	dcmn_err(("devfs_unconfigure: dip = 0x%p, flags = 0x%x",
405	    (void *)dip, flags));
406
407	/* avoid recursion back into the device tree */
408	(void) tsd_set(devfs_clean_key, (void *)1);
409	dvp = devfs_dip_to_dvnode(dip);
410	if (dvp == NULL) {
411		(void) tsd_set(devfs_clean_key, NULL);
412		return (0);
413	}
414
415	(void) dv_cleandir(dvp, devnm, flags);
416	(void) tsd_set(devfs_clean_key, NULL);
417	VN_RELE(DVTOV(dvp));
418
419	/*
420	 * If we are doing a DV_CLEAN_FORCE, and we did not start at the
421	 * root, and we did not start at a vHCI node then clean vHCI
422	 * branches too.  Failure to clean vHCI branch does not cause EBUSY.
423	 *
424	 * Also, to accommodate nexus callers that clean 'self' to DR 'child'
425	 * (like pcihp) we clean vHCIs even when dv_cleandir() of dip branch
426	 * above fails - this prevents a busy DR 'child' sibling from causing
427	 * the DR of 'child' to fail because a vHCI branch was not cleaned.
428	 */
429	if ((flags & DV_CLEAN_FORCE) && (dip != ddi_root_node()) &&
430	    (mdi_component_is_vhci(dip, NULL) != MDI_SUCCESS)) {
431		/*
432		 * NOTE: for backport the following is recommended
433		 * 	(void) devfs_clean_vhci(scsi_vhci_dip,
434		 *	    (void *)(uintptr_t)flags);
435		 */
436		mdi_walk_vhcis(devfs_clean_vhci, (void *)(uintptr_t)flags);
437	}
438
439	return (0);
440}
441
442/*
443 * lookup a devfs relative pathname, returning held vnodes for the final
444 * component and the containing directory (if requested).
445 *
446 * NOTE: We can't use lookupname because this would use the current
447 *	processes credentials (CRED) in the call lookuppnvp instead
448 *	of kcred.  It also does not give you the flexibility so
449 * 	specify the directory to start the resolution in (devicesdir).
450 */
451int
452devfs_lookupname(
453	char	*pathname,		/* user pathname */
454	vnode_t **dirvpp,		/* ret for ptr to parent dir vnode */
455	vnode_t **compvpp)		/* ret for ptr to component vnode */
456{
457	struct pathname	pn;
458	int		error;
459
460	ASSERT(devicesdir);		/* devfs must be initialized */
461	ASSERT(pathname);		/* must have some path */
462
463	if (error = pn_get(pathname, UIO_SYSSPACE, &pn))
464		return (error);
465
466	/* make the path relative to /devices. */
467	pn_skipslash(&pn);
468	if (pn_pathleft(&pn) == 0) {
469		/* all we had was "\0" or "/" (which skipslash skiped) */
470		if (dirvpp)
471			*dirvpp = NULL;
472		if (compvpp) {
473			VN_HOLD(devicesdir);
474			*compvpp = devicesdir;
475		}
476	} else {
477		/*
478		 * Use devfs lookup to resolve pathname to the vnode for
479		 * the device via relative lookup in devfs. Extra holds for
480		 * using devicesdir as directory we are searching and for
481		 * being our root without being == rootdir.
482		 */
483		VN_HOLD(devicesdir);
484		VN_HOLD(devicesdir);
485		error = lookuppnvp(&pn, NULL, FOLLOW, dirvpp, compvpp,
486		    devicesdir, devicesdir, kcred);
487	}
488	pn_free(&pn);
489
490	return (error);
491}
492
493/*
494 * Given a devfs path (without the /devices prefix), walk
495 * the dv_node sub-tree rooted at the path.
496 */
497int
498devfs_walk(
499	char		*path,
500	void		(*callback)(struct dv_node *, void *),
501	void		*arg)
502{
503	char *dirpath, *devnm;
504	struct vnode	*dirvp;
505
506	ASSERT(path && callback);
507
508	if (*path != '/' || devfs_mntinfo == NULL)
509		return (ENXIO);
510
511	dcmn_err(("devfs_walk: path = %s", path));
512
513	dirpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
514
515	(void) snprintf(dirpath, MAXPATHLEN, "/devices%s", path);
516
517	devnm = strrchr(dirpath, '/');
518
519	ASSERT(devnm);
520
521	*devnm++ = '\0';
522
523	if (lookupname(dirpath, UIO_SYSSPACE, 0, NULL, &dirvp)) {
524		dcmn_err(("directory %s not found\n", dirpath));
525		kmem_free(dirpath, MAXPATHLEN);
526		return (ENXIO);
527	}
528
529	/*
530	 * if path == "/", visit the root dv_node
531	 */
532	if (*devnm == '\0') {
533		callback(VTODV(dirvp), arg);
534		devnm = NULL;
535	}
536
537	dv_walk(VTODV(dirvp), devnm, callback, arg);
538
539	VN_RELE(dirvp);
540
541	kmem_free(dirpath, MAXPATHLEN);
542
543	return (0);
544}
545
546int
547devfs_devpolicy(vnode_t *vp, devplcy_t **dpp)
548{
549	struct vnode *rvp;
550	struct dv_node *dvp;
551	int rval = -1;
552
553	/* fail if devfs not mounted yet */
554	if (devfs_mntinfo == NULL)
555		return (rval);
556
557	if (VOP_REALVP(vp, &rvp, NULL) == 0 && vn_matchops(rvp, dv_vnodeops)) {
558		dvp = VTODV(rvp);
559		rw_enter(&dvp->dv_contents, RW_READER);
560		if (dvp->dv_priv) {
561			dphold(dvp->dv_priv);
562			*dpp = dvp->dv_priv;
563			rval = 0;
564		}
565		rw_exit(&dvp->dv_contents);
566	}
567	return (rval);
568}
569