1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25/*
26 * vnode ops for the devfs
27 *
28 * For leaf vnode special files (VCHR|VBLK) specfs will always see the VOP
29 * first because dv_find always performs leaf vnode substitution, returning
30 * a specfs vnode with an s_realvp pointing to the devfs leaf vnode. This
31 * means that the only leaf special file VOP operations that devfs will see
32 * after VOP_LOOKUP are the ones that specfs forwards.
33 */
34
35#include <sys/types.h>
36#include <sys/param.h>
37#include <sys/t_lock.h>
38#include <sys/systm.h>
39#include <sys/sysmacros.h>
40#include <sys/user.h>
41#include <sys/time.h>
42#include <sys/vfs.h>
43#include <sys/vnode.h>
44#include <sys/vfs_opreg.h>
45#include <sys/file.h>
46#include <sys/fcntl.h>
47#include <sys/flock.h>
48#include <sys/kmem.h>
49#include <sys/uio.h>
50#include <sys/errno.h>
51#include <sys/stat.h>
52#include <sys/cred.h>
53#include <sys/dirent.h>
54#include <sys/pathname.h>
55#include <sys/cmn_err.h>
56#include <sys/debug.h>
57#include <sys/policy.h>
58#include <sys/modctl.h>
59#include <sys/sunndi.h>
60#include <fs/fs_subr.h>
61#include <sys/fs/dv_node.h>
62
/*
 * Attribute templates defined elsewhere in devfs.  In this file they are
 * used as the starting values when building a vattr for a directory
 * (dv_vattr_dir) or a device node (dv_vattr_file).
 */
extern struct vattr	dv_vattr_dir, dv_vattr_file;
/* dev_t compared against v_rdev to spot the node fronting the console. */
extern dev_t rconsdev;
65
66/*
67 * Open of devices (leaf nodes) is handled by specfs.
68 * There is nothing to do to open a directory
69 */
70/*ARGSUSED*/
71static int
72devfs_open(struct vnode **vpp, int flag, struct cred *cred,
73    caller_context_t *ct)
74{
75	struct dv_node	*dv = VTODV(*vpp);
76
77	dcmn_err2(("devfs_open %s\n", dv->dv_name));
78	ASSERT((*vpp)->v_type == VDIR);
79	return (0);
80}
81
82/*
83 * Close of devices (leaf nodes) is handled by specfs.
84 * There is nothing much to do inorder to close a directory.
85 */
86/*ARGSUSED1*/
87static int
88devfs_close(struct vnode *vp, int flag, int count,
89    offset_t offset, struct cred *cred, caller_context_t *ct)
90{
91	struct dv_node	*dv = VTODV(vp);
92
93	dcmn_err2(("devfs_close %s\n", dv->dv_name));
94	ASSERT(vp->v_type == VDIR);
95
96	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
97	cleanshares(vp, ttoproc(curthread)->p_pid);
98	return (0);
99}
100
101/*
102 * Read of devices (leaf nodes) is handled by specfs.
103 * Read of directories is not supported.
104 */
105/*ARGSUSED*/
106static int
107devfs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
108	struct caller_context *ct)
109{
110	dcmn_err2(("devfs_read %s\n", VTODV(vp)->dv_name));
111	ASSERT(vp->v_type == VDIR);
112	ASSERT(RW_READ_HELD(&VTODV(vp)->dv_contents));
113	return (EISDIR);
114}
115
116/*
117 * Write of devices (leaf nodes) is handled by specfs.
118 * Write of directories is not supported.
119 */
120/*ARGSUSED*/
121static int
122devfs_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
123	struct caller_context *ct)
124{
125	dcmn_err2(("devfs_write %s\n", VTODV(vp)->dv_name));
126	ASSERT(vp->v_type == VDIR);
127	ASSERT(RW_WRITE_HELD(&VTODV(vp)->dv_contents));
128	return (EISDIR);
129}
130
131/*
132 * Ioctls to device (leaf nodes) is handled by specfs.
133 * Ioctl to directories is not supported.
134 */
135/*ARGSUSED*/
136static int
137devfs_ioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
138    struct cred *cred, int *rvalp, caller_context_t *ct)
139{
140	dcmn_err2(("devfs_ioctl %s\n", VTODV(vp)->dv_name));
141	ASSERT(vp->v_type == VDIR);
142
143	return (ENOTTY);	/* no ioctls supported */
144}
145
146/*
147 * We can be asked directly about the attributes of directories, or
148 * (via sp->s_realvp) about the filesystem attributes of special files.
149 *
150 * For directories, we just believe the attribute store
151 * though we mangle the nodeid, fsid, and rdev to convince userland we
152 * really are a different filesystem.
153 *
154 * For special files, a little more fakery is required.
155 *
156 * If the attribute store is not there (read only root), we believe our
157 * memory based attributes.
158 */
159static int
160devfs_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cr,
161    caller_context_t *ct)
162{
163	struct dv_node	*dv = VTODV(vp);
164	int		error = 0;
165	uint_t		mask;
166
167	/*
168	 * Message goes to console only. Otherwise, the message
169	 * causes devfs_getattr to be invoked again... infinite loop
170	 */
171	dcmn_err2(("?devfs_getattr %s\n", dv->dv_name));
172	ASSERT(dv->dv_attr || dv->dv_attrvp);
173
174	if (!(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK)) {
175		cmn_err(CE_WARN,	/* panic ? */
176		    "?%s: getattr on vnode type %d", dvnm, vp->v_type);
177		return (ENOENT);
178	}
179
180	rw_enter(&dv->dv_contents, RW_READER);
181	if (dv->dv_attr) {
182		/*
183		 * obtain from the memory version of attribute.
184		 * preserve mask for those that optimize.
185		 * devfs specific fields are already merged on creation.
186		 */
187		mask = vap->va_mask;
188		*vap = *dv->dv_attr;
189		vap->va_mask = mask;
190	} else {
191		/* obtain from attribute store and merge */
192		error = VOP_GETATTR(dv->dv_attrvp, vap, flags, cr, ct);
193		dsysdebug(error, ("vop_getattr %s %d\n", dv->dv_name, error));
194		dv_vattr_merge(dv, vap);
195	}
196	rw_exit(&dv->dv_contents);
197
198	/*
199	 * Restrict the permissions of the node fronting the console
200	 * to 0600 with root as the owner.  This prevents a non-root
201	 * user from gaining access to a serial terminal (like /dev/term/a)
202	 * which is in reality serving as the console device (/dev/console).
203	 */
204	if (vp->v_rdev == rconsdev) {
205		mode_t	rconsmask = S_IXUSR|S_IRWXG|S_IRWXO;
206		vap->va_mode &= (~rconsmask);
207		vap->va_uid = 0;
208	}
209
210	return (error);
211}
212
/*
 * Forward declaration; the definition appears later in this file.  It is
 * passed to secpolicy_vnode_setattr() as the access-check callback.
 */
static int devfs_unlocked_access(void *, int, struct cred *);

/*
 * Apply a setattr request to a devfs directory node.
 *
 *	dv	- devfs node for the directory
 *	vp	- vnode of that directory (must be VDIR)
 *	vap	- requested attributes; va_mask selects which ones
 *	flags	- passed through to the policy check and the backing store
 *	cr	- caller credentials
 *
 * If the node has memory based attributes (dv_attr) the request is
 * policy-checked and applied to them.  Otherwise it is forwarded to the
 * backing attribute store (dv_attrvp); if that store answers EROFS, a
 * memory based copy of the current attributes is created and the request
 * is retried against it.
 *
 * Returns 0 on success, EINVAL if va_mask contains AT_NOSET bits, or the
 * error from the policy check / backing store.
 */
/*ARGSUSED4*/
static int
devfs_setattr_dir(
	struct dv_node *dv,
	struct vnode *vp,
	struct vattr *vap,
	int flags,
	struct cred *cr)
{
	struct vattr	*map;
	uint_t		mask;
	int		error = 0;
	struct vattr	vattr;

	ASSERT(dv->dv_attr || dv->dv_attrvp);

	ASSERT(vp->v_type == VDIR);
	ASSERT((dv->dv_flags & DV_NO_FSPERM) == 0);

	if (vap->va_mask & AT_NOSET)
		return (EINVAL);

	/* to ensure consistency, single thread setting of attributes */
	rw_enter(&dv->dv_contents, RW_WRITER);

	/*
	 * Re-entered (with the writer lock still held) after the EROFS
	 * fallback below has installed dv->dv_attr.
	 */
again:	if (dv->dv_attr) {

		/* devfs_unlocked_access is used since dv_contents is held */
		error = secpolicy_vnode_setattr(cr, vp, vap,
		    dv->dv_attr, flags, devfs_unlocked_access, dv);

		if (error)
			goto out;

		/*
		 * Apply changes to the memory based attribute. This code
		 * is modeled after the tmpfs implementation of memory
		 * based vnodes
		 */
		map = dv->dv_attr;
		mask = vap->va_mask;

		/* Change file access modes. */
		if (mask & AT_MODE) {
			/* keep the file type bits, replace everything else */
			map->va_mode &= S_IFMT;
			map->va_mode |= vap->va_mode & ~S_IFMT;
		}
		if (mask & AT_UID)
			map->va_uid = vap->va_uid;
		if (mask & AT_GID)
			map->va_gid = vap->va_gid;
		if (mask & AT_ATIME)
			map->va_atime = vap->va_atime;
		if (mask & AT_MTIME)
			map->va_mtime = vap->va_mtime;

		/* an atime-only change does not touch ctime */
		if (mask & (AT_MODE | AT_UID | AT_GID | AT_MTIME))
			gethrestime(&map->va_ctime);
	} else {
		/* use the backing attribute store */
		ASSERT(dv->dv_attrvp);

		/*
		 * See if we are changing something we care about
		 * the persistence of - return success if we don't care.
		 */
		if (vap->va_mask & (AT_MODE|AT_UID|AT_GID|AT_ATIME|AT_MTIME)) {
			/* Set the attributes */
			error = VOP_SETATTR(dv->dv_attrvp,
			    vap, flags, cr, NULL);
			dsysdebug(error,
			    ("vop_setattr %s %d\n", dv->dv_name, error));

			/*
			 * Some file systems may return EROFS for a setattr
			 * on a readonly file system.  In this case we create
			 * our own memory based attribute.
			 */
			if (error == EROFS) {
				/*
				 * obtain attributes from existing file
				 * that we will modify and switch to memory
				 * based attribute until attribute store is
				 * read/write.
				 */
				vattr = dv_vattr_dir;
				if (VOP_GETATTR(dv->dv_attrvp,
				    &vattr, flags, cr, NULL) == 0) {
					dv->dv_attr = kmem_alloc(
					    sizeof (struct vattr), KM_SLEEP);
					*dv->dv_attr = vattr;
					dv_vattr_merge(dv, dv->dv_attr);
					goto again;
				}
				/* getattr failed: EROFS is returned as is */
			}
		}
	}
out:
	rw_exit(&dv->dv_contents);
	return (error);
}
315
316
317/*
318 * Compare the uid/gid/mode changes requested for a setattr
319 * operation with the same details of a node's default minor
320 * perm information.  Return 0 if identical.
321 */
322static int
323dv_setattr_cmp(struct vattr *map, mperm_t *mp)
324{
325	if ((map->va_mode & S_IAMB) != (mp->mp_mode & S_IAMB))
326		return (1);
327	if (map->va_uid != mp->mp_uid)
328		return (1);
329	if (map->va_gid != mp->mp_gid)
330		return (1);
331	return (0);
332}
333
334
335/*ARGSUSED4*/
336static int
337devfs_setattr(
338	struct vnode *vp,
339	struct vattr *vap,
340	int flags,
341	struct cred *cr,
342	caller_context_t *ct)
343{
344	struct dv_node	*dv = VTODV(vp);
345	struct dv_node	*ddv;
346	struct vnode	*dvp;
347	struct vattr	*map;
348	uint_t		mask;
349	int		error = 0;
350	struct vattr	*free_vattr = NULL;
351	struct vattr	*vattrp = NULL;
352	mperm_t		mp;
353	int		persist;
354
355	/*
356	 * Message goes to console only. Otherwise, the message
357	 * causes devfs_getattr to be invoked again... infinite loop
358	 */
359	dcmn_err2(("?devfs_setattr %s\n", dv->dv_name));
360	ASSERT(dv->dv_attr || dv->dv_attrvp);
361
362	if (!(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK)) {
363		cmn_err(CE_WARN,	/* panic ? */
364		    "?%s: getattr on vnode type %d", dvnm, vp->v_type);
365		return (ENOENT);
366	}
367
368	if (vap->va_mask & AT_NOSET)
369		return (EINVAL);
370
371	/*
372	 * If we are changing something we don't care about
373	 * the persistence of, return success.
374	 */
375	if ((vap->va_mask &
376	    (AT_MODE|AT_UID|AT_GID|AT_ATIME|AT_MTIME)) == 0)
377		return (0);
378
379	/*
380	 * If driver overrides fs perm, disallow chmod
381	 * and do not create attribute nodes.
382	 */
383	if (dv->dv_flags & DV_NO_FSPERM) {
384		ASSERT(dv->dv_attr);
385		if (vap->va_mask & (AT_MODE | AT_UID | AT_GID))
386			return (EPERM);
387		if ((vap->va_mask & (AT_ATIME|AT_MTIME)) == 0)
388			return (0);
389		rw_enter(&dv->dv_contents, RW_WRITER);
390		if (vap->va_mask & AT_ATIME)
391			dv->dv_attr->va_atime = vap->va_atime;
392		if (vap->va_mask & AT_MTIME)
393			dv->dv_attr->va_mtime = vap->va_mtime;
394		rw_exit(&dv->dv_contents);
395		return (0);
396	}
397
398	/*
399	 * Directories are always created but device nodes are
400	 * only used to persist non-default permissions.
401	 */
402	if (vp->v_type == VDIR) {
403		ASSERT(dv->dv_attr || dv->dv_attrvp);
404		return (devfs_setattr_dir(dv, vp, vap, flags, cr));
405	}
406
407	/*
408	 * Allocate now before we take any locks
409	 */
410	vattrp = kmem_zalloc(sizeof (*vattrp), KM_SLEEP);
411
412	/* to ensure consistency, single thread setting of attributes */
413	rw_enter(&dv->dv_contents, RW_WRITER);
414
415	/*
416	 * We don't need to create an attribute node
417	 * to persist access or modification times.
418	 */
419	persist = (vap->va_mask & (AT_MODE | AT_UID | AT_GID));
420
421	/*
422	 * If persisting something, get the default permissions
423	 * for this minor to compare against what the attributes
424	 * are now being set to.  Default ordering is:
425	 *	- minor_perm match for this minor
426	 *	- mode supplied by ddi_create_priv_minor_node
427	 *	- devfs defaults
428	 */
429	if (persist) {
430		if (dev_minorperm(dv->dv_devi, dv->dv_name, &mp) != 0) {
431			mp.mp_uid = dv_vattr_file.va_uid;
432			mp.mp_gid = dv_vattr_file.va_gid;
433			mp.mp_mode = dv_vattr_file.va_mode;
434			if (dv->dv_flags & DV_DFLT_MODE) {
435				ASSERT((dv->dv_dflt_mode & ~S_IAMB) == 0);
436				mp.mp_mode &= ~S_IAMB;
437				mp.mp_mode |= dv->dv_dflt_mode;
438				dcmn_err5(("%s: setattr priv default 0%o\n",
439				    dv->dv_name, mp.mp_mode));
440			} else {
441				dcmn_err5(("%s: setattr devfs default 0%o\n",
442				    dv->dv_name, mp.mp_mode));
443			}
444		} else {
445			dcmn_err5(("%s: setattr minor perm default 0%o\n",
446			    dv->dv_name, mp.mp_mode));
447		}
448	}
449
450	/*
451	 * If we don't have a vattr for this node, construct one.
452	 */
453	if (dv->dv_attr) {
454		free_vattr = vattrp;
455		vattrp = NULL;
456	} else {
457		ASSERT(dv->dv_attrvp);
458		ASSERT(vp->v_type != VDIR);
459		*vattrp = dv_vattr_file;
460		error = VOP_GETATTR(dv->dv_attrvp, vattrp, 0, cr, ct);
461		dsysdebug(error, ("vop_getattr %s %d\n", dv->dv_name, error));
462		if (error)
463			goto out;
464		dv->dv_attr = vattrp;
465		dv_vattr_merge(dv, dv->dv_attr);
466		vattrp = NULL;
467	}
468
469	error = secpolicy_vnode_setattr(cr, vp, vap, dv->dv_attr,
470	    flags, devfs_unlocked_access, dv);
471	if (error) {
472		dsysdebug(error, ("devfs_setattr %s secpolicy error %d\n",
473		    dv->dv_name, error));
474		goto out;
475	}
476
477	/*
478	 * Apply changes to the memory based attribute. This code
479	 * is modeled after the tmpfs implementation of memory
480	 * based vnodes
481	 */
482	map = dv->dv_attr;
483	mask = vap->va_mask;
484
485	/* Change file access modes. */
486	if (mask & AT_MODE) {
487		map->va_mode &= S_IFMT;
488		map->va_mode |= vap->va_mode & ~S_IFMT;
489	}
490	if (mask & AT_UID)
491		map->va_uid = vap->va_uid;
492	if (mask & AT_GID)
493		map->va_gid = vap->va_gid;
494	if (mask & AT_ATIME)
495		map->va_atime = vap->va_atime;
496	if (mask & AT_MTIME)
497		map->va_mtime = vap->va_mtime;
498
499	if (mask & (AT_MODE | AT_UID | AT_GID | AT_MTIME)) {
500		gethrestime(&map->va_ctime);
501	}
502
503	/*
504	 * A setattr to defaults means we no longer need the
505	 * shadow node as a persistent store, unless there
506	 * are ACLs.  Otherwise create a shadow node if one
507	 * doesn't exist yet.
508	 */
509	if (persist) {
510		if ((dv_setattr_cmp(map, &mp) == 0) &&
511		    ((dv->dv_flags & DV_ACL) == 0)) {
512
513			if (dv->dv_attrvp) {
514				ddv = dv->dv_dotdot;
515				ASSERT(ddv->dv_attrvp);
516				error = VOP_REMOVE(ddv->dv_attrvp,
517				    dv->dv_name, cr, ct, 0);
518				dsysdebug(error,
519				    ("vop_remove %s %s %d\n",
520				    ddv->dv_name, dv->dv_name, error));
521
522				if (error == EROFS)
523					error = 0;
524				VN_RELE(dv->dv_attrvp);
525				dv->dv_attrvp = NULL;
526			}
527			ASSERT(dv->dv_attr);
528		} else {
529			if (mask & AT_MODE)
530				dcmn_err5(("%s persisting mode 0%o\n",
531				    dv->dv_name, vap->va_mode));
532			if (mask & AT_UID)
533				dcmn_err5(("%s persisting uid %d\n",
534				    dv->dv_name, vap->va_uid));
535			if (mask & AT_GID)
536				dcmn_err5(("%s persisting gid %d\n",
537				    dv->dv_name, vap->va_gid));
538
539			if (dv->dv_attrvp == NULL) {
540				dvp = DVTOV(dv->dv_dotdot);
541				dv_shadow_node(dvp, dv->dv_name, vp,
542				    NULL, NULLVP, cr,
543				    DV_SHADOW_CREATE | DV_SHADOW_WRITE_HELD);
544			}
545			if (dv->dv_attrvp) {
546				/* If map still valid do TIME for free. */
547				if (dv->dv_attr == map) {
548					mask = map->va_mask;
549					map->va_mask =
550					    vap->va_mask | AT_ATIME | AT_MTIME;
551					error = VOP_SETATTR(dv->dv_attrvp, map,
552					    flags, cr, NULL);
553					map->va_mask = mask;
554				} else {
555					error = VOP_SETATTR(dv->dv_attrvp,
556					    vap, flags, cr, NULL);
557				}
558				dsysdebug(error, ("vop_setattr %s %d\n",
559				    dv->dv_name, error));
560			}
561			/*
562			 * Some file systems may return EROFS for a setattr
563			 * on a readonly file system.  In this case save
564			 * as our own memory based attribute.
565			 * NOTE: ufs is NOT one of these (see ufs_iupdat).
566			 */
567			if (dv->dv_attr && dv->dv_attrvp && error == 0) {
568				vattrp = dv->dv_attr;
569				dv->dv_attr = NULL;
570			} else if (error == EROFS)
571				error = 0;
572		}
573	}
574
575out:
576	rw_exit(&dv->dv_contents);
577
578	if (vattrp)
579		kmem_free(vattrp, sizeof (*vattrp));
580	if (free_vattr)
581		kmem_free(free_vattr, sizeof (*free_vattr));
582	return (error);
583}
584
585static int
586devfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
587    caller_context_t *ct)
588{
589	switch (cmd) {
590	case _PC_ACL_ENABLED:
591		/*
592		 * We rely on the underlying filesystem for ACLs,
593		 * so direct the query for ACL support there.
594		 * ACL support isn't relative to the file
595		 * and we can't guarantee that the dv node
596		 * has an attribute node, so any valid
597		 * attribute node will suffice.
598		 */
599		ASSERT(dvroot);
600		ASSERT(dvroot->dv_attrvp);
601		return (VOP_PATHCONF(dvroot->dv_attrvp, cmd, valp, cr, ct));
602		/*NOTREACHED*/
603	}
604
605	return (fs_pathconf(vp, cmd, valp, cr, ct));
606}
607
608/*
609 * Let avp handle security attributes (acl's).
610 */
611static int
612devfs_getsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
613    struct cred *cr, caller_context_t *ct)
614{
615	dvnode_t *dv = VTODV(vp);
616	struct vnode *avp;
617	int	error;
618
619	dcmn_err2(("devfs_getsecattr %s\n", dv->dv_name));
620	ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
621
622	rw_enter(&dv->dv_contents, RW_READER);
623
624	avp = dv->dv_attrvp;
625
626	/* fabricate the acl */
627	if (avp == NULL) {
628		error = fs_fab_acl(vp, vsap, flags, cr, ct);
629		rw_exit(&dv->dv_contents);
630		return (error);
631	}
632
633	error = VOP_GETSECATTR(avp, vsap, flags, cr, ct);
634	dsysdebug(error, ("vop_getsecattr %s %d\n", VTODV(vp)->dv_name, error));
635	rw_exit(&dv->dv_contents);
636	return (error);
637}
638
639/*
640 * Set security attributes (acl's)
641 *
642 * Note that the dv_contents lock has already been acquired
643 * by the caller's VOP_RWLOCK.
644 */
645static int
646devfs_setsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
647    struct cred *cr, caller_context_t *ct)
648{
649	dvnode_t *dv = VTODV(vp);
650	struct vnode *avp;
651	int	error;
652
653	dcmn_err2(("devfs_setsecattr %s\n", dv->dv_name));
654	ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
655	ASSERT(RW_LOCK_HELD(&dv->dv_contents));
656
657	/*
658	 * Not a supported operation on drivers not providing
659	 * file system based permissions.
660	 */
661	if (dv->dv_flags & DV_NO_FSPERM)
662		return (ENOTSUP);
663
664	/*
665	 * To complete, the setsecattr requires an underlying attribute node.
666	 */
667	if (dv->dv_attrvp == NULL) {
668		ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);
669		dv_shadow_node(DVTOV(dv->dv_dotdot), dv->dv_name, vp,
670		    NULL, NULLVP, cr, DV_SHADOW_CREATE | DV_SHADOW_WRITE_HELD);
671	}
672
673	if ((avp = dv->dv_attrvp) == NULL) {
674		dcmn_err2(("devfs_setsecattr %s: "
675		    "cannot construct attribute node\n", dv->dv_name));
676		return (fs_nosys());
677	}
678
679	/*
680	 * The acl(2) system call issues a VOP_RWLOCK before setting an ACL.
681	 * Since backing file systems expect the lock to be held before seeing
682	 * a VOP_SETSECATTR ACL, we need to issue the VOP_RWLOCK to the backing
683	 * store before forwarding the ACL.
684	 */
685	(void) VOP_RWLOCK(avp, V_WRITELOCK_TRUE, NULL);
686	error = VOP_SETSECATTR(avp, vsap, flags, cr, ct);
687	dsysdebug(error, ("vop_setsecattr %s %d\n", VTODV(vp)->dv_name, error));
688	VOP_RWUNLOCK(avp, V_WRITELOCK_TRUE, NULL);
689
690	/*
691	 * Set DV_ACL if we have a non-trivial set of ACLs.  It is not
692	 * necessary to hold VOP_RWLOCK since fs_acl_nontrivial only does
693	 * VOP_GETSECATTR calls.
694	 */
695	if (fs_acl_nontrivial(avp, cr))
696		dv->dv_flags |= DV_ACL;
697	return (error);
698}
699
700/*
701 * This function is used for secpolicy_setattr().  It must call an
702 * access() like function while it is already holding the
703 * dv_contents lock.  We only care about this when dv_attr != NULL;
704 * so the unlocked access call only concerns itself with that
705 * particular branch of devfs_access().
706 */
707static int
708devfs_unlocked_access(void *vdv, int mode, struct cred *cr)
709{
710	struct dv_node *dv = vdv;
711	int shift = 0;
712	uid_t owner = dv->dv_attr->va_uid;
713
714	/* Check access based on owner, group and public permissions. */
715	if (crgetuid(cr) != owner) {
716		shift += 3;
717		if (groupmember(dv->dv_attr->va_gid, cr) == 0)
718			shift += 3;
719	}
720
721	return (secpolicy_vnode_access2(cr, DVTOV(dv), owner,
722	    dv->dv_attr->va_mode << shift, mode));
723}
724
725static int
726devfs_access(struct vnode *vp, int mode, int flags, struct cred *cr,
727    caller_context_t *ct)
728{
729	struct dv_node	*dv = VTODV(vp);
730	int		res;
731
732	dcmn_err2(("devfs_access %s\n", dv->dv_name));
733	ASSERT(dv->dv_attr || dv->dv_attrvp);
734
735	/* restrict console access to privileged processes */
736	if ((vp->v_rdev == rconsdev) && secpolicy_console(cr) != 0) {
737		return (EACCES);
738	}
739
740	rw_enter(&dv->dv_contents, RW_READER);
741	if (dv->dv_attr && ((dv->dv_flags & DV_ACL) == 0)) {
742		res = devfs_unlocked_access(dv, mode, cr);
743	} else {
744		res = VOP_ACCESS(dv->dv_attrvp, mode, flags, cr, ct);
745	}
746	rw_exit(&dv->dv_contents);
747	return (res);
748}
749
750/*
751 * Lookup
752 *
753 * Given the directory vnode and the name of the component, return
754 * the corresponding held vnode for that component.
755 *
756 * Of course in these fictional filesystems, nothing's ever quite
757 * -that- simple.
758 *
759 * devfs name	type		shadow (fs attributes)	type	comments
760 * -------------------------------------------------------------------------
761 * drv[@addr]	VDIR		drv[@addr]		VDIR	nexus driver
762 * drv[@addr]:m	VCHR/VBLK	drv[@addr]:m		VREG	leaf driver
763 * drv[@addr]	VCHR/VBLK	drv[@addr]:.default	VREG	leaf driver
764 * -------------------------------------------------------------------------
765 *
766 * The following names are reserved for the attribute filesystem (which
767 * could easily be another layer on top of this one - we simply need to
768 * hold the vnode of the thing we're looking at)
769 *
770 * attr name	type		shadow (fs attributes)	type	comments
771 * -------------------------------------------------------------------------
772 * drv[@addr]	VDIR		-			-	attribute dir
773 * minorname	VDIR		-			-	minorname
774 * attribute	VREG		-			-	attribute
775 * -------------------------------------------------------------------------
776 *
777 * Examples:
778 *
779 *	devfs:/devices/.../mm@0:zero		VCHR
780 *	shadow:/.devices/.../mm@0:zero		VREG, fs attrs
781 *	devfs:/devices/.../mm@0:/zero/attr	VREG, driver attribute
782 *
783 *	devfs:/devices/.../sd@0,0:a		VBLK
784 *	shadow:/.devices/.../sd@0,0:a		VREG, fs attrs
785 *	devfs:/devices/.../sd@0,0:/a/.type	VREG, "ddi_block:chan"
786 *
787 *	devfs:/devices/.../mm@0			VCHR
788 *	shadow:/.devices/.../mm@0:.default	VREG, fs attrs
789 *	devfs:/devices/.../mm@0:/.default/attr	VREG, driver attribute
790 *	devfs:/devices/.../mm@0:/.default/.type	VREG, "ddi_pseudo"
791 *
792 *	devfs:/devices/.../obio			VDIR
793 *	shadow:/devices/.../obio		VDIR, needed for fs attrs.
794 *	devfs:/devices/.../obio:/.default/attr	VDIR, driver attribute
795 *
 * We also need to be able to deal with "old" devices that have gone away,
797 * though I think that provided we return them with readdir, they can
798 * be removed (i.e. they don't have to respond to lookup, though it might
799 * be weird if they didn't ;-)
800 *
801 * Lookup has side-effects.
802 *
803 * - It will create directories and fs attribute files in the shadow hierarchy.
804 * - It should cause non-SID devices to be probed (ask the parent nexi).
805 */
806/*ARGSUSED3*/
807static int
808devfs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
809    struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
810    caller_context_t *ct, int *direntflags, pathname_t *realpnp)
811{
812	ASSERT(dvp->v_type == VDIR);
813	dcmn_err2(("devfs_lookup: %s\n", nm));
814	return (dv_find(VTODV(dvp), nm, vpp, pnp, rdir, cred, 0));
815}
816
817/*
818 * devfs nodes can't really be created directly by userland - however,
819 * we do allow creates to find existing nodes:
820 *
821 * - any create fails if the node doesn't exist - EROFS.
822 * - creating an existing directory read-only succeeds, otherwise EISDIR.
823 * - exclusive creates fail if the node already exists - EEXIST.
824 * - failure to create the snode for an existing device - ENOSYS.
825 */
826/*ARGSUSED2*/
827static int
828devfs_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
829    int mode, struct vnode **vpp, struct cred *cred, int flag,
830    caller_context_t *ct, vsecattr_t *vsecp)
831{
832	int error;
833	struct vnode *vp;
834
835	dcmn_err2(("devfs_create %s\n", nm));
836	error = dv_find(VTODV(dvp), nm, &vp, NULL, NULLVP, cred, 0);
837	if (error == 0) {
838		if (excl == EXCL)
839			error = EEXIST;
840		else if (vp->v_type == VDIR && (mode & VWRITE))
841			error = EISDIR;
842		else
843			error = VOP_ACCESS(vp, mode, 0, cred, ct);
844
845		if (error) {
846			VN_RELE(vp);
847		} else
848			*vpp = vp;
849	} else if (error == ENOENT)
850		error = EROFS;
851
852	return (error);
853}
854
/*
 * VOP_READDIR for devfs.
 *
 * If DV_BUILD is set, the directory is first populated via dv_filldir()
 * (we call into nexus driver to do a BUS_CONFIG_ALL).  Otherwise, simply
 * return cached dv_node's. Hotplug code always calls devfs_clean() to
 * invalidate the dv_node cache.
 *
 * Directory offsets are entry indices: 0 is ".", 1 is "..", and 2
 * onwards are the child dv_nodes in iteration order.  Skipped (hidden
 * or internal) entries still consume an offset.
 *
 * Entered with dv_contents held as reader; it is held as reader again
 * on return, although it is dropped and retaken along the way.
 *
 * Returns 0 on success (with *eofp set if supplied), EINVAL if the uio
 * has more than one iovec or the buffer cannot hold even one entry,
 * ENOTDIR if dvp is not a directory, or the uiomove() error.
 */
/*ARGSUSED5*/
static int
devfs_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, int *eofp,
    caller_context_t *ct, int flags)
{
	struct dv_node *ddv, *dv;
	struct dirent64 *de, *bufp;
	offset_t diroff;
	offset_t	soff;
	size_t reclen, movesz;
	int error;
	struct vattr va;
	size_t bufsz;

	ddv = VTODV(dvp);
	dcmn_err2(("devfs_readdir %s: offset %lld len %ld\n",
	    ddv->dv_name, uiop->uio_loffset, uiop->uio_iov->iov_len));
	ASSERT(ddv->dv_attr || ddv->dv_attrvp);
	ASSERT(RW_READ_HELD(&ddv->dv_contents));

	/* an offset at or beyond MAXOFF_T is reported as end of directory */
	if (uiop->uio_loffset >= MAXOFF_T) {
		if (eofp)
			*eofp = 1;
		return (0);
	}

	/* bufsz below is taken from the first iovec only */
	if (uiop->uio_iovcnt != 1)
		return (EINVAL);

	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/* Load the initial contents */
	if (ddv->dv_flags & DV_BUILD) {
		/*
		 * Filling needs the writer lock.  If the upgrade fails the
		 * lock is dropped and retaken, so another thread may have
		 * filled the directory in between; hence the recheck.
		 */
		if (!rw_tryupgrade(&ddv->dv_contents)) {
			rw_exit(&ddv->dv_contents);
			rw_enter(&ddv->dv_contents, RW_WRITER);
		}

		/* recheck and fill */
		if (ddv->dv_flags & DV_BUILD)
			dv_filldir(ddv);

		rw_downgrade(&ddv->dv_contents);
	}

	/* entries are staged in a kernel buffer and copied out at the end */
	soff = uiop->uio_loffset;
	bufsz = uiop->uio_iov->iov_len;
	de = bufp = kmem_alloc(bufsz, KM_SLEEP);
	movesz = 0;
	/*
	 * Non-NULL sentinel: if "." or ".." does not fit we jump to full:
	 * before the loop assigns dv, and the code there must still see
	 * "more entries remain".
	 */
	dv = (struct dv_node *)-1;

	/*
	 * Move as many entries into the uio structure as it will take.
	 * Special case "." and "..".
	 */
	diroff = 0;
	if (soff == 0) {				/* . */
		reclen = DIRENT64_RECLEN(strlen("."));
		if ((movesz + reclen) > bufsz)
			goto full;
		de->d_ino = (ino64_t)ddv->dv_ino;
		de->d_off = (off64_t)diroff + 1;
		de->d_reclen = (ushort_t)reclen;

		/* use strncpy(9f) to zero out uninitialized bytes */

		(void) strncpy(de->d_name, ".", DIRENT64_NAMELEN(reclen));
		movesz += reclen;
		de = (dirent64_t *)(intptr_t)((char *)de + reclen);
		dcmn_err3(("devfs_readdir: A: diroff %lld, soff %lld: '%s' "
		    "reclen %lu\n", diroff, soff, ".", reclen));
	}

	diroff++;
	if (soff <= 1) {				/* .. */
		reclen = DIRENT64_RECLEN(strlen(".."));
		if ((movesz + reclen) > bufsz)
			goto full;
		de->d_ino = (ino64_t)ddv->dv_dotdot->dv_ino;
		de->d_off = (off64_t)diroff + 1;
		de->d_reclen = (ushort_t)reclen;

		/* use strncpy(9f) to zero out uninitialized bytes */

		(void) strncpy(de->d_name, "..", DIRENT64_NAMELEN(reclen));
		movesz += reclen;
		de = (dirent64_t *)(intptr_t)((char *)de + reclen);
		dcmn_err3(("devfs_readdir: B: diroff %lld, soff %lld: '%s' "
		    "reclen %lu\n", diroff, soff, "..", reclen));
	}

	/* children start at offset 2; dv ends up NULL once all are visited */
	diroff++;
	for (dv = DV_FIRST_ENTRY(ddv); dv;
	    dv = DV_NEXT_ENTRY(ddv, dv), diroff++) {
		/* skip entries until at correct directory offset */
		if (diroff < soff)
			continue;

		/*
		 * hidden nodes are skipped (but they still occupy a
		 * directory offset).
		 */
		if (dv->dv_devi && ndi_dev_is_hidden_node(dv->dv_devi))
			continue;

		/*
		 * DDM_INTERNAL_PATH minor nodes are skipped for readdirs
		 * outside the kernel (but they still occupy a directory
		 * offset).
		 */
		if ((dv->dv_flags & DV_INTERNAL) && (cred != kcred))
			continue;

		reclen = DIRENT64_RECLEN(strlen(dv->dv_name));
		if ((movesz + reclen) > bufsz) {
			dcmn_err3(("devfs_readdir: C: diroff "
			    "%lld, soff %lld: '%s' reclen %lu\n",
			    diroff, soff, dv->dv_name, reclen));
			/* diroff stays at this entry so the next read resumes here */
			goto full;
		}
		de->d_ino = (ino64_t)dv->dv_ino;
		de->d_off = (off64_t)diroff + 1;
		de->d_reclen = (ushort_t)reclen;

		/* use strncpy(9f) to zero out uninitialized bytes */

		ASSERT(strlen(dv->dv_name) + 1 <=
		    DIRENT64_NAMELEN(reclen));
		(void) strncpy(de->d_name, dv->dv_name,
		    DIRENT64_NAMELEN(reclen));

		movesz += reclen;
		de = (dirent64_t *)(intptr_t)((char *)de + reclen);
		dcmn_err4(("devfs_readdir: D: diroff "
		    "%lld, soff %lld: '%s' reclen %lu\n", diroff, soff,
		    dv->dv_name, reclen));
	}

	/* the buffer is full, or we exhausted everything */
full:	dcmn_err3(("devfs_readdir: moving %lu bytes: "
	    "diroff %lld, soff %lld, dv %p\n",
	    movesz, diroff, soff, (void *)dv));

	/* dv is non-NULL here exactly when entries remain unreturned */
	if ((movesz == 0) && dv)
		error = EINVAL;		/* cannot be represented */
	else {
		error = uiomove(bufp, movesz, UIO_READ, uiop);
		if (error == 0) {
			if (eofp)
				*eofp = dv ? 0 : 1;
			/* next read resumes at the first unreturned entry */
			uiop->uio_loffset = diroff;
		}

		/*
		 * Record the access time.  devfs_setattr() takes
		 * dv_contents as writer itself, so our reader hold must
		 * be dropped around the call and retaken afterwards.
		 * The result is deliberately ignored.
		 */
		va.va_mask = AT_ATIME;
		gethrestime(&va.va_atime);
		rw_exit(&ddv->dv_contents);
		(void) devfs_setattr(dvp, &va, 0, cred, ct);
		rw_enter(&ddv->dv_contents, RW_READER);
	}

	kmem_free(bufp, bufsz);
	return (error);
}
1024
1025/*ARGSUSED*/
1026static int
1027devfs_fsync(struct vnode *vp, int syncflag, struct cred *cred,
1028    caller_context_t *ct)
1029{
1030	/*
1031	 * Message goes to console only. Otherwise, the message
1032	 * causes devfs_fsync to be invoked again... infinite loop
1033	 */
1034	dcmn_err2(("devfs_fsync %s\n", VTODV(vp)->dv_name));
1035	return (0);
1036}
1037
1038/*
1039 * Normally, we leave the dv_node here at count of 0.
1040 * The node will be destroyed when dv_cleandir() is called.
1041 *
1042 * Stale dv_node's are already unlinked from the fs tree,
1043 * so dv_cleandir() won't find them. We destroy such nodes
1044 * immediately.
1045 */
1046/*ARGSUSED1*/
1047static void
1048devfs_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
1049{
1050	int destroy;
1051	struct dv_node *dv = VTODV(vp);
1052
1053	dcmn_err2(("devfs_inactive: %s\n", dv->dv_name));
1054	mutex_enter(&vp->v_lock);
1055	ASSERT(vp->v_count >= 1);
1056	--vp->v_count;
1057	destroy = (DV_STALE(dv) && vp->v_count == 0);
1058	mutex_exit(&vp->v_lock);
1059
1060	/* stale nodes cannot be rediscovered, destroy it here */
1061	if (destroy)
1062		dv_destroy(dv, 0);
1063}
1064
1065/*
1066 * XXX Why do we need this?  NFS mounted /dev directories?
1067 * XXX Talk to peter staubach about this.
1068 */
1069/*ARGSUSED2*/
1070static int
1071devfs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
1072{
1073	struct dv_node	*dv = VTODV(vp);
1074	struct dv_fid	*dv_fid;
1075
1076	if (fidp->fid_len < (sizeof (struct dv_fid) - sizeof (ushort_t))) {
1077		fidp->fid_len = sizeof (struct dv_fid) - sizeof (ushort_t);
1078		return (ENOSPC);
1079	}
1080
1081	dv_fid = (struct dv_fid *)fidp;
1082	bzero(dv_fid, sizeof (struct dv_fid));
1083	dv_fid->dvfid_len = (int)sizeof (struct dv_fid) - sizeof (ushort_t);
1084	dv_fid->dvfid_ino = dv->dv_ino;
1085	/* dv_fid->dvfid_gen = dv->tn_gen; XXX ? */
1086
1087	return (0);
1088}
1089
1090/*
1091 * This pair of routines bracket all VOP_READ, VOP_WRITE
1092 * and VOP_READDIR requests.  The contents lock stops things
1093 * moving around while we're looking at them.
1094 *
1095 * Also used by file and record locking.
1096 */
1097/*ARGSUSED2*/
1098static int
1099devfs_rwlock(struct vnode *vp, int write_flag, caller_context_t *ct)
1100{
1101	dcmn_err2(("devfs_rwlock %s\n", VTODV(vp)->dv_name));
1102	rw_enter(&VTODV(vp)->dv_contents, write_flag ? RW_WRITER : RW_READER);
1103	return (write_flag);
1104}
1105
1106/*ARGSUSED1*/
1107static void
1108devfs_rwunlock(struct vnode *vp, int write_flag, caller_context_t *ct)
1109{
1110	dcmn_err2(("devfs_rwunlock %s\n", VTODV(vp)->dv_name));
1111	rw_exit(&VTODV(vp)->dv_contents);
1112}
1113
1114/*
1115 * XXX	Should probably do a better job of computing the maximum
1116 *	offset available in the directory.
1117 */
1118/*ARGSUSED1*/
1119static int
1120devfs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
1121    caller_context_t *ct)
1122{
1123	ASSERT(vp->v_type == VDIR);
1124	dcmn_err2(("devfs_seek %s\n", VTODV(vp)->dv_name));
1125	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1126}
1127
/*
 * Vnode operations vector for devfs.  It is only declared here; nothing
 * in this part of the file assigns it (presumably it is built from the
 * template below during filesystem initialization -- confirm against
 * the vfs init code).
 */
vnodeops_t *dv_vnodeops;

/*
 * Template pairing each vnode operation name with its devfs handler.
 * VOPNAME_DISPOSE is mapped to the generic fs_error routine, and the
 * table is terminated by a NULL, NULL pair.
 */
const fs_operation_def_t dv_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = devfs_open },
	VOPNAME_CLOSE,		{ .vop_close = devfs_close },
	VOPNAME_READ,		{ .vop_read = devfs_read },
	VOPNAME_WRITE,		{ .vop_write = devfs_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = devfs_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = devfs_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = devfs_setattr },
	VOPNAME_ACCESS,		{ .vop_access = devfs_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = devfs_lookup },
	VOPNAME_CREATE,		{ .vop_create = devfs_create },
	VOPNAME_READDIR,	{ .vop_readdir = devfs_readdir },
	VOPNAME_FSYNC,		{ .vop_fsync = devfs_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = devfs_inactive },
	VOPNAME_FID,		{ .vop_fid = devfs_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = devfs_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = devfs_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = devfs_seek },
	VOPNAME_PATHCONF,	{ .vop_pathconf = devfs_pathconf },
	VOPNAME_DISPOSE,	{ .error = fs_error },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = devfs_setsecattr },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = devfs_getsecattr },
	NULL,			NULL
};
1154