specvnops.c revision 3898:c788126f2a20
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */


#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitmap.h>
#include <sys/buf.h>
#include <sys/cmn_err.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/debug.h>
#include <sys/dkio.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/open.h>
#include <sys/swap.h>
#include <sys/sysmacros.h>
#include <sys/uio.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/policy.h>
#include <sys/devpolicy.h>

#include <sys/proc.h>
#include <sys/user.h>
#include <sys/session.h>
#include <sys/vmsystm.h>
#include <sys/vtrace.h>
#include <sys/pathname.h>

#include <sys/fs/snode.h>

#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_dev.h>
#include <vm/seg_vn.h>

#include <fs/fs_subr.h>

#include <sys/esunddi.h>
#include <sys/autoconf.h>
#include <sys/sunndi.h>


static int spec_open(struct vnode **, int, struct cred *);
static int spec_close(struct vnode *, int, int, offset_t, struct cred *);
static int spec_read(struct vnode *, struct uio *, int, struct cred *,
	struct caller_context *);
static int spec_write(struct vnode *, struct uio *, int, struct cred *,
	struct caller_context *);
static int spec_ioctl(struct vnode *, int, intptr_t, int, struct cred *, int *);
static int spec_getattr(struct vnode *, struct vattr *, int, struct cred *);
static int spec_setattr(struct vnode *, struct vattr *, int, struct cred *,
	caller_context_t *);
static int spec_access(struct vnode *, int, int, struct cred *);
static int spec_create(struct vnode *, char *, vattr_t *, enum vcexcl,
    int, struct vnode **, struct cred *, int);
static int spec_fsync(struct vnode *, int, struct cred *);
static void spec_inactive(struct vnode *, struct cred *);
static int spec_fid(struct vnode *, struct fid *);
static int spec_seek(struct vnode *, offset_t, offset_t *);
static int spec_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
    struct flk_callback *, struct cred *);
static int spec_realvp(struct vnode *, struct vnode **);

static int spec_getpage(struct vnode *, offset_t, size_t, uint_t *, page_t **,
    size_t, struct seg *, caddr_t, enum seg_rw, struct cred *);
static int spec_putapage(struct vnode *, page_t *, u_offset_t *, size_t *, int,
	struct cred *);
static struct buf *spec_startio(struct vnode *, page_t *, u_offset_t, size_t,
	int);
static int spec_getapage(struct vnode *, u_offset_t, size_t, uint_t *,
    page_t **, size_t, struct seg *, caddr_t, enum seg_rw, struct cred *);
static int spec_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
    uchar_t, uchar_t, uint_t, struct cred *);
static int spec_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uchar_t, uchar_t, uint_t, struct cred *);
static int spec_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uint_t, uint_t, uint_t, struct cred *);

static int spec_poll(struct vnode *, short, int, short *, struct pollhead **);
static int spec_dump(struct vnode *, caddr_t, int, int);
static int spec_pageio(struct vnode *, page_t *, u_offset_t, size_t, int,
    cred_t *);

static int spec_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *);
static int spec_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *);
static int spec_pathconf(struct	vnode *, int, ulong_t *, struct cred *);
#define	SN_HOLD(csp)	{ \
	mutex_enter(&csp->s_lock); \
	csp->s_count++; \
	mutex_exit(&csp->s_lock); \
}

#define	SN_RELE(csp)	{ \
	mutex_enter(&csp->s_lock); \
	csp->s_count--; \
	ASSERT((csp->s_count > 0) || (csp->s_vnode->v_stream == NULL)); \
	mutex_exit(&csp->s_lock); \
}

struct vnodeops *spec_vnodeops;

const fs_operation_def_t spec_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = spec_open },
	VOPNAME_CLOSE,		{ .vop_close = spec_close },
	VOPNAME_READ,		{ .vop_read = spec_read },
	VOPNAME_WRITE,		{ .vop_write = spec_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = spec_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = spec_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = spec_setattr },
	VOPNAME_ACCESS,		{ .vop_access = spec_access },
	VOPNAME_CREATE,		{ .vop_create = spec_create },
	VOPNAME_FSYNC,		{ .vop_fsync = spec_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = spec_inactive },
	VOPNAME_FID,		{ .vop_fid = spec_fid },
	VOPNAME_SEEK,		{ .vop_seek = spec_seek },
	VOPNAME_PATHCONF,	{ .vop_pathconf = spec_pathconf },
	VOPNAME_FRLOCK,		{ .vop_frlock = spec_frlock },
	VOPNAME_REALVP,		{ .vop_realvp = spec_realvp },
	VOPNAME_GETPAGE,	{ .vop_getpage = spec_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = spec_putpage },
	VOPNAME_MAP,		{ .vop_map = spec_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = spec_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = spec_delmap },
	VOPNAME_POLL,		{ .vop_poll = spec_poll },
	VOPNAME_DUMP,		{ .vop_dump = spec_dump },
	VOPNAME_PAGEIO,		{ .vop_pageio = spec_pageio },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = spec_setsecattr },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = spec_getsecattr },
	NULL,			NULL
};

/*
 * Return address of spec_vnodeops
 */
struct vnodeops *
spec_getvnodeops(void)
{
	return (spec_vnodeops);
}

extern vnode_t *rconsvp;

/*
 * Acquire the serial lock on the common snode.
 */
#define	LOCK_CSP(csp)					\
	mutex_enter(&csp->s_lock);			\
	while (csp->s_flag & SLOCKED) {			\
		csp->s_flag |= SWANT;			\
		cv_wait(&csp->s_cv, &csp->s_lock);	\
	}						\
	csp->s_flag |= SLOCKED;				\
	mutex_exit(&csp->s_lock);

#define	LOCK_CSP_SIG(csp)	lock_csp_sig(csp)

/*
 * Acquire the serial lock on the common snode checking for a signal.
 * cv_wait_sig is used to allow signals to pull us out.
 * Return 1 if locked, 0 if interrupted
 */
static int
lock_csp_sig(struct snode *csp)
{
	mutex_enter(&csp->s_lock);
	while (csp->s_flag & SLOCKED) {
		csp->s_flag |= SWANT;
		if (!cv_wait_sig(&csp->s_cv, &csp->s_lock)) {
			mutex_exit(&csp->s_lock);
			/* interrupted */
			return (0);
		}
	}
	csp->s_flag |= SLOCKED;
	mutex_exit(&csp->s_lock);

	return (1);
}

/*
 * Unlock the serial lock on the common snode
 */
#define	UNLOCK_CSP_LOCK_HELD(csp)			\
	ASSERT(mutex_owned(&csp->s_lock));		\
	if (csp->s_flag & SWANT)			\
		cv_broadcast(&csp->s_cv);		\
	csp->s_flag &= ~(SWANT|SLOCKED);

#define	UNLOCK_CSP(csp)					\
	mutex_enter(&csp->s_lock);			\
	UNLOCK_CSP_LOCK_HELD(csp);			\
	mutex_exit(&csp->s_lock);

/*
 * compute/return the size of the device
 */
#define	SPEC_SIZE(csp)	\
	(((csp)->s_flag & SSIZEVALID) ? (csp)->s_size : spec_size(csp))

/*
 * Compute and return the size.  If the size in the common snode is valid then
 * return it.  If not valid then get the size from the driver and set size in
 * the common snode.  If the device has not been attached then we don't ask for
 * an update from the driver- for non-streams SSIZEVALID stays unset until the
 * device is attached. A stat of a mknod outside /devices (non-devfs) may
 * report UNKNOWN_SIZE because the device may not be attached yet (SDIPSET not
 * established in mknod until open time). A stat in /devices will report the
 * size correctly.  Specfs should always call SPEC_SIZE instead of referring
 * directly to s_size to initialize/retrieve the size of a device.
 *
 * XXX There is an inconsistency between block and raw - "unknown" is
 * UNKNOWN_SIZE for VBLK and 0 for VCHR(raw).
 */
static u_offset_t
spec_size(struct snode *csp)
{
	struct vnode	*cvp = STOV(csp);
	u_offset_t	size;
	int		plen;
	uint32_t	size32;
	dev_t		dev;
	dev_info_t	*devi;
	major_t		maj;

	ASSERT((csp)->s_commonvp == cvp);	/* must be common node */

	/* return cached value */
	mutex_enter(&csp->s_lock);
	if (csp->s_flag & SSIZEVALID) {
		mutex_exit(&csp->s_lock);
		return (csp->s_size);
	}

	/* VOP_GETATTR of mknod has not had devcnt restriction applied */
	dev = cvp->v_rdev;
	maj = getmajor(dev);
	if (maj >= devcnt) {
		/* return non-cached UNKNOWN_SIZE */
		mutex_exit(&csp->s_lock);
		return ((cvp->v_type == VCHR) ? 0 : UNKNOWN_SIZE);
	}

	/* establish cached zero size for streams */
	if (STREAMSTAB(maj)) {
		csp->s_size = 0;
		csp->s_flag |= SSIZEVALID;
		mutex_exit(&csp->s_lock);
		return (0);
	}

	/*
	 * Return non-cached UNKNOWN_SIZE if not open.
	 *
	 * NB: This check is bogus, calling prop_op(9E) should be gated by
	 * attach, not open. Not having this check however opens up a new
	 * context under which a driver's prop_op(9E) could be called. Calling
	 * prop_op(9E) in this new context has been shown to expose latent
	 * driver bugs (insufficient NULL pointer checks that lead to panic).
	 * We are keeping this open check for now to avoid these panics.
	 */
	if (csp->s_count == 0) {
		mutex_exit(&csp->s_lock);
		return ((cvp->v_type == VCHR) ? 0 : UNKNOWN_SIZE);
	}

	/* Return non-cached UNKNOWN_SIZE if not attached. */
	if (((csp->s_flag & SDIPSET) == 0) || (csp->s_dip == NULL) ||
	    !i_ddi_devi_attached(csp->s_dip)) {
		mutex_exit(&csp->s_lock);
		return ((cvp->v_type == VCHR) ? 0 : UNKNOWN_SIZE);
	}

	devi = csp->s_dip;

	/*
	 * Establish cached size obtained from the attached driver. Since we
	 * know the devinfo node, for efficiency we use cdev_prop_op directly
	 * instead of [cb]dev_[Ss]size.
	 */
	if (cvp->v_type == VCHR) {
		size = 0;
		plen = sizeof (size);
		if (cdev_prop_op(dev, devi, PROP_LEN_AND_VAL_BUF,
		    DDI_PROP_NOTPROM | DDI_PROP_DONTPASS |
		    DDI_PROP_CONSUMER_TYPED, "Size", (caddr_t)&size,
		    &plen) != DDI_PROP_SUCCESS) {
			plen = sizeof (size32);
			if (cdev_prop_op(dev, devi, PROP_LEN_AND_VAL_BUF,
			    DDI_PROP_NOTPROM | DDI_PROP_DONTPASS,
			    "size", (caddr_t)&size32, &plen) ==
			    DDI_PROP_SUCCESS)
				size = size32;
		}
	} else {
		size = UNKNOWN_SIZE;
		plen = sizeof (size);
		if (cdev_prop_op(dev, devi, PROP_LEN_AND_VAL_BUF,
		    DDI_PROP_NOTPROM | DDI_PROP_DONTPASS |
		    DDI_PROP_CONSUMER_TYPED, "Nblocks", (caddr_t)&size,
		    &plen) != DDI_PROP_SUCCESS) {
			plen = sizeof (size32);
			if (cdev_prop_op(dev, devi, PROP_LEN_AND_VAL_BUF,
			    DDI_PROP_NOTPROM | DDI_PROP_DONTPASS,
			    "nblocks", (caddr_t)&size32, &plen) ==
			    DDI_PROP_SUCCESS)
				size = size32;
		}

		if (size != UNKNOWN_SIZE) {
			/* convert from block size to byte size */
			if (size < (MAXOFFSET_T >> DEV_BSHIFT))
				size = size << DEV_BSHIFT;
			else
				size = UNKNOWN_SIZE;
		}
	}

	csp->s_size = size;
	csp->s_flag |= SSIZEVALID;

	mutex_exit(&csp->s_lock);
	return (size);
}

/*
383 * device cloning.
384 */
385static int
386spec_clone(struct vnode **vpp, dev_t newdev, int vtype, struct stdata *stp)
387{
388	dev_t		dev = (*vpp)->v_rdev;
389	major_t		maj = getmajor(dev);
390	major_t 	newmaj = getmajor(newdev);
391	int		sysclone = (maj == clone_major);
392	int		qassociate_used = 0;
393	struct snode	*oldsp, *oldcsp;
394	struct snode	*newsp, *newcsp;
395	struct vnode	*newvp, *newcvp;
396	dev_info_t	*dip;
397	queue_t		*dq;
398
399	ASSERT(dev != newdev);
400
401	/*
402	 * Check for cloning across different drivers.
403	 * We only support this under the system provided clone driver
404	 */
405	if ((maj != newmaj) && !sysclone) {
406		cmn_err(CE_NOTE,
407		    "unsupported clone open maj = %u, newmaj = %u",
408		    maj, newmaj);
409		return (ENXIO);
410	}
411
412	/* old */
413	oldsp = VTOS(*vpp);
414	oldcsp = VTOS(oldsp->s_commonvp);
415
416	/* new */
417	newvp = makespecvp(newdev, vtype);
418	ASSERT(newvp != NULL);
419	newsp = VTOS(newvp);
420	newcvp = newsp->s_commonvp;
421	newcsp = VTOS(newcvp);
422
423	/*
424	 * Clones inherit fsid, realvp, and dip.
425	 * XXX realvp inherit is not occurring, does fstat of clone work?
426	 */
427	newsp->s_fsid = oldsp->s_fsid;
428	if (sysclone) {
429		newsp->s_flag |= SCLONE;
430		dip = NULL;
431	} else {
432		newsp->s_flag |= SSELFCLONE;
433		dip = oldcsp->s_dip;
434	}
435
436	/*
437	 * If we cloned to an opened newdev that already has called
438	 * spec_assoc_vp_with_devi (SDIPSET set) then the association is
439	 * already established.
440	 */
441	if (!(newcsp->s_flag & SDIPSET)) {
442		/*
443		 * Establish s_dip association for newdev.
444		 *
445		 * If we trusted the getinfo(9E) DDI_INFO_DEVT2INSTANCE
446		 * implementation of all cloning drivers  (SCLONE and SELFCLONE)
447		 * we would always use e_ddi_hold_devi_by_dev().  We know that
448		 * many drivers have had (still have?) problems with
449		 * DDI_INFO_DEVT2INSTANCE, so we try to minimize reliance by
450		 * detecting drivers that use QASSOCIATE (by looking down the
451		 * stream) and setting their s_dip association to NULL.
452		 */
453		qassociate_used = 0;
454		if (stp) {
455			for (dq = stp->sd_wrq; dq; dq = dq->q_next) {
456				if (_RD(dq)->q_flag & _QASSOCIATED) {
457					qassociate_used = 1;
458					dip = NULL;
459					break;
460				}
461			}
462		}
463
464		if (dip || qassociate_used) {
465			spec_assoc_vp_with_devi(newvp, dip);
466		} else {
467			/* derive association from newdev */
468			dip = e_ddi_hold_devi_by_dev(newdev, 0);
469			spec_assoc_vp_with_devi(newvp, dip);
470			if (dip)
471				ddi_release_devi(dip);
472		}
473	}
474
475	SN_HOLD(newcsp);
476
477	/* deal with stream stuff */
478	if (stp != NULL) {
479		LOCK_CSP(newcsp);	/* synchronize stream open/close */
480		mutex_enter(&newcsp->s_lock);
481		newcvp->v_stream = newvp->v_stream = stp;
482		stp->sd_vnode = newcvp;
483		stp->sd_strtab = STREAMSTAB(newmaj);
484		mutex_exit(&newcsp->s_lock);
485		UNLOCK_CSP(newcsp);
486	}
487
488	/* substitute the vnode */
489	SN_RELE(oldcsp);
490	VN_RELE(*vpp);
491	*vpp = newvp;
492
493	return (0);
494}
495
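/*
 * Open a special file.  Make sure the underlying device is attached and
 * the device open policy allows the open, then hand off to dev_open() or,
 * for STREAMS drivers, stropen().  Clone opens are completed by
 * substituting a new vnode through spec_clone().
 */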
static int
spec_open(struct vnode **vpp, int flag, struct cred *cr)
{
	major_t maj;
	dev_t dev, newdev;
	struct vnode *vp, *cvp;
	struct snode *sp, *csp;
	struct stdata *stp;
	dev_info_t *dip;
	int error, type;

	flag &= ~FCREAT;		/* paranoia */

	vp = *vpp;
	sp = VTOS(vp);
	ASSERT((vp->v_type == VCHR) || (vp->v_type == VBLK));
	if ((vp->v_type != VCHR) && (vp->v_type != VBLK))
		return (ENXIO);

	/*
	 * If the VFS_NODEVICES bit was set for the mount,
	 * do not allow opens of special devices.
	 */
	if (sp->s_realvp && (sp->s_realvp->v_vfsp->vfs_flag & VFS_NODEVICES))
		return (ENXIO);

	newdev = dev = vp->v_rdev;

	/*
	 * If we are opening a node that has not had spec_assoc_vp_with_devi
	 * called against it (mknod outside /devices or a non-dacf makespecvp
	 * node) then SDIPSET will not be set. In this case we call an
	 * interface which will reconstruct the path and lookup (drive attach)
	 * through devfs (e_ddi_hold_devi_by_dev -> e_ddi_hold_devi_by_path ->
	 * devfs_lookupname).  For support of broken drivers that don't call
	 * ddi_create_minor_node for all minor nodes in their instance space,
	 * we call interfaces that operate at the directory/devinfo
	 * (major/instance) level instead of to the leaf/minor node level.
	 * After finding and attaching the dip we associate it with the
	 * common specfs vnode (s_dip), which sets SDIPSET.  A DL_DETACH_REQ
	 * to style-2 stream driver may set s_dip to NULL with SDIPSET set.
	 *
	 * NOTE: Although e_ddi_hold_devi_by_dev takes a dev_t argument, its
	 * implementation operates at the major/instance level since it only
	 * needs to return a dip.
	 */
	cvp = sp->s_commonvp;
	csp = VTOS(cvp);
	if (!(csp->s_flag & SDIPSET)) {
		/* try to attach, return error if we fail */
		if ((dip = e_ddi_hold_devi_by_dev(dev, 0)) == NULL)
			return (ENXIO);

		/* associate dip with the common snode s_dip */
		spec_assoc_vp_with_devi(vp, dip);
		ddi_release_devi(dip);	/* from e_ddi_hold_devi_by_dev */
	}

#ifdef  DEBUG
	/* verify attach/open exclusion guarantee */
	dip = csp->s_dip;
	ASSERT((dip == NULL) || i_ddi_devi_attached(dip));
#endif  /* DEBUG */

	if ((error = secpolicy_spec_open(cr, cvp, flag)) != 0)
		return (error);

	maj = getmajor(dev);
	if (STREAMSTAB(maj))
		goto streams_open;

	SN_HOLD(csp);			/* increment open count */

	/* non streams open */
	type = (vp->v_type == VBLK ? OTYP_BLK : OTYP_CHR);
	error = dev_open(&newdev, flag, type, cr);

	/* deal with clone case */
	if (error == 0 && dev != newdev) {
		error = spec_clone(vpp, newdev, vp->v_type, NULL);
		/*
		 * bail on clone failure, further processing
		 * results in undefined behaviors.
		 */
		if (error != 0)
			return (error);
		sp = VTOS(*vpp);
		csp = VTOS(sp->s_commonvp);
	}

	if (error == 0) {
		sp->s_size = SPEC_SIZE(csp);

		if ((csp->s_flag & SNEEDCLOSE) == 0) {
			int nmaj = getmajor(newdev);
			mutex_enter(&csp->s_lock);
			/* successful open needs a close later */
			csp->s_flag |= SNEEDCLOSE;

			/*
			 * Invalidate possible cached "unknown" size
			 * established by a VOP_GETATTR while open was in
			 * progress, and the driver might fail prop_op(9E).
			 */
			if (((cvp->v_type == VCHR) && (csp->s_size == 0)) ||
			    ((cvp->v_type == VBLK) &&
			    (csp->s_size == UNKNOWN_SIZE)))
				csp->s_flag &= ~SSIZEVALID;

			if (devopsp[nmaj]->devo_cb_ops->cb_flag & D_64BIT)
				csp->s_flag |= SLOFFSET;
			if (devopsp[nmaj]->devo_cb_ops->cb_flag & D_U64BIT)
				csp->s_flag |= SLOFFSET | SANYOFFSET;
			mutex_exit(&csp->s_lock);
		}
		return (0);
	}

	/*
	 * Open failed.  If we missed a close operation because we were
	 * trying to get the device open, and this is the last in-progress
	 * open that is failing, then call close.
	 *
	 * NOTE: Only non-streams open has this race condition.
	 */
	mutex_enter(&csp->s_lock);
	csp->s_count--;			/* decrement open count : SN_RELE */
	if ((csp->s_count == 0) &&	/* no outstanding open */
	    (csp->s_mapcnt == 0) &&	/* no mapping */
	    (csp->s_flag & SNEEDCLOSE)) { /* need a close */
		csp->s_flag &= ~(SNEEDCLOSE | SSIZEVALID);

		/* See comment in spec_close() */
		if (csp->s_flag & (SCLONE | SSELFCLONE))
			csp->s_flag &= ~SDIPSET;

		mutex_exit(&csp->s_lock);
		ASSERT(*vpp != NULL);
		(void) device_close(*vpp, flag, cr);
	} else {
		mutex_exit(&csp->s_lock);
	}
	return (error);

streams_open:
	if (vp->v_type != VCHR)
		return (ENXIO);

	/*
	 * Lock common snode to prevent any new clone opens
	 * on this stream while one is in progress.
	 * This is necessary since the stream currently
	 * associated with the clone device will not be part
	 * of it after the clone open completes.
	 * Unfortunately we don't know in advance if this is
	 * a clone device so we have to lock all opens.
	 *
	 * If we fail, it's because of an interrupt.
	 */
	if (LOCK_CSP_SIG(csp) == 0)
		return (EINTR);

	SN_HOLD(csp);			/* increment open count */

	error = stropen(cvp, &newdev, flag, cr);
	stp = cvp->v_stream;

	/* deal with the clone case */
	if ((error == 0) && (dev != newdev)) {
		vp->v_stream = cvp->v_stream = NULL;
		UNLOCK_CSP(csp);
		error = spec_clone(vpp, newdev, vp->v_type, stp);
		/*
		 * bail on clone failure, further processing
		 * results in undefined behaviors.
		 */
		if (error != 0)
			return (error);
		sp = VTOS(*vpp);
		csp = VTOS(sp->s_commonvp);
	} else if (error == 0) {
		vp->v_stream = stp;
		UNLOCK_CSP(csp);
	}

	if (error == 0) {
		/* STREAMS devices don't have a size */
		sp->s_size = csp->s_size = 0;

		if (!(stp->sd_flag & STRISTTY) || (flag & FNOCTTY))
			return (0);

		/* try to allocate it as a controlling terminal */
		if (strctty(stp) != EINTR)
			return (0);

		/* strctty() was interrupted by a signal */
		(void) spec_close(vp, flag, 1, 0, cr);
		return (EINTR);
	}

	/*
	 * Deal with stropen failure.
	 *
	 * sd_flag in the stream head cannot change since the
	 * common snode is locked before the call to stropen().
	 */
	if ((stp != NULL) && (stp->sd_flag & STREOPENFAIL)) {
		/*
		 * Open failed part way through.
		 */
		mutex_enter(&stp->sd_lock);
		stp->sd_flag &= ~STREOPENFAIL;
		mutex_exit(&stp->sd_lock);

		UNLOCK_CSP(csp);
		(void) spec_close(vp, flag, 1, 0, cr);
	} else {
		UNLOCK_CSP(csp);
		SN_RELE(csp);
	}

	return (error);
}

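/*
 * Close a special file.  Lock, share and stream cleanup is done for
 * user-level closes; the driver's close entry point is called only when
 * the last open reference and mapping on the common snode go away.
 */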
/*ARGSUSED2*/
static int
spec_close(
	struct vnode	*vp,
	int		flag,
	int		count,
	offset_t	offset,
	struct cred	*cr)
{
	struct vnode *cvp;
	struct snode *sp, *csp;
	enum vtype type;
	dev_t dev;
	int error = 0;
	int sysclone;

	if (!(flag & FKLYR)) {
		/* this only applies to closes of devices from userland */
		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
		cleanshares(vp, ttoproc(curthread)->p_pid);
		if (vp->v_stream)
			strclean(vp);
	}
	if (count > 1)
		return (0);

	sp = VTOS(vp);
	cvp = sp->s_commonvp;

	dev = sp->s_dev;
	type = vp->v_type;

	ASSERT(type == VCHR || type == VBLK);

	/*
	 * Prevent close/close and close/open races by serializing closes
	 * on this common snode. Clone opens are held up until after
	 * we have closed this device so the streams linkage is maintained
	 */
	csp = VTOS(cvp);

	LOCK_CSP(csp);
	mutex_enter(&csp->s_lock);

	csp->s_count--;			/* one fewer open reference : SN_RELE */
	sysclone = sp->s_flag & SCLONE;

	/*
	 * Invalidate size on each close.
	 *
	 * XXX We do this on each close because we don't have interfaces that
	 * allow a driver to invalidate the size.  Since clearing this on each
	 * close causes property overhead, we skip /dev/null and
	 * /dev/zero to avoid degrading kenbus performance.
	 */
	if (getmajor(dev) != mm_major)
		csp->s_flag &= ~SSIZEVALID;

	/*
	 * Only call the close routine when the last open reference through
	 * any [s, v]node goes away.  This can be checked by looking at
	 * s_count on the common vnode.
	 */
	if ((csp->s_count == 0) && (csp->s_mapcnt == 0)) {
		/* we don't need a close */
		csp->s_flag &= ~(SNEEDCLOSE | SSIZEVALID);

		/*
		 * A cloning driver may open-clone to the same dev_t that we
		 * are closing before spec_inactive destroys the common snode.
		 * If this occurs the s_dip association needs to be reevaluated.
		 * We clear SDIPSET to force reevaluation in this case.  When
		 * reevaluation occurs (by spec_clone after open), if the
		 * devinfo association has changed then the old association
		 * will be released as the new association is established by
		 * spec_assoc_vp_with_devi().
		 */
		if (csp->s_flag & (SCLONE | SSELFCLONE))
			csp->s_flag &= ~SDIPSET;

		mutex_exit(&csp->s_lock);
		error = device_close(vp, flag, cr);

		/*
		 * Decrement the devops held in clnopen()
		 */
		if (sysclone) {
			ddi_rele_driver(getmajor(dev));
		}
		mutex_enter(&csp->s_lock);
	}

	UNLOCK_CSP_LOCK_HELD(csp);
	mutex_exit(&csp->s_lock);

	return (error);
}

/*ARGSUSED2*/
static int
spec_read(
	struct vnode	*vp,
	struct uio	*uiop,
	int		ioflag,
	struct cred	*cr,
	struct caller_context *ct)
{
	int error;
	struct snode *sp = VTOS(vp);
	dev_t dev = sp->s_dev;
	size_t n;
	ulong_t on;
	u_offset_t bdevsize;
	offset_t maxoff;
	offset_t off;
	struct vnode *blkvp;

	ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);

	if (STREAMSTAB(getmajor(dev))) {	/* stream */
		ASSERT(vp->v_type == VCHR);
		smark(sp, SACC);
		return (strread(vp, uiop, cr));
	}

	if (uiop->uio_resid == 0)
		return (0);

	/*
	 * Plain old character devices that set D_U64BIT can have
	 * unrestricted offsets.
	 */
	maxoff = spec_maxoffset(vp);
	ASSERT(maxoff != -1 || vp->v_type == VCHR);

	if (maxoff != -1 && (uiop->uio_loffset < 0 ||
	    uiop->uio_loffset + uiop->uio_resid > maxoff))
		return (EINVAL);

	if (vp->v_type == VCHR) {
		smark(sp, SACC);
		ASSERT(STREAMSTAB(getmajor(dev)) == 0);
		return (cdev_read(dev, uiop, cr));
	}

	/*
	 * Block device.
	 */
	error = 0;
	blkvp = sp->s_commonvp;
	bdevsize = SPEC_SIZE(VTOS(blkvp));

	do {
		caddr_t base;
		offset_t diff;

		off = uiop->uio_loffset & (offset_t)MAXBMASK;
		on = (size_t)(uiop->uio_loffset & MAXBOFFSET);
		n = (size_t)MIN(MAXBSIZE - on, uiop->uio_resid);
		diff = bdevsize - uiop->uio_loffset;

		if (diff <= 0)
			break;
		if (diff < n)
			n = (size_t)diff;

		if (vpm_enable) {
			error = vpm_data_copy(blkvp, (u_offset_t)(off + on),
				n, uiop, 1, NULL, 0, S_READ);
		} else {
			base = segmap_getmapflt(segkmap, blkvp,
				(u_offset_t)(off + on), n, 1, S_READ);

			error = uiomove(base + on, n, UIO_READ, uiop);
		}
		if (!error) {
			int flags = 0;
			/*
			 * If we read a whole block, we won't need this
			 * buffer again soon.
			 */
			if (n + on == MAXBSIZE)
				flags = SM_DONTNEED | SM_FREE;
			if (vpm_enable) {
				error = vpm_sync_pages(blkvp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			if (vpm_enable) {
				(void) vpm_sync_pages(blkvp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
			if (bdevsize == UNKNOWN_SIZE) {
				error = 0;
				break;
			}
		}
	} while (error == 0 && uiop->uio_resid > 0 && n != 0);

	return (error);
}

/*ARGSUSED*/
static int
spec_write(
	struct vnode *vp,
	struct uio *uiop,
	int ioflag,
	struct cred *cr,
	struct caller_context *ct)
{
	int error;
	struct snode *sp = VTOS(vp);
	dev_t dev = sp->s_dev;
	size_t n;
	ulong_t on;
	u_offset_t bdevsize;
	offset_t maxoff;
	offset_t off;
	struct vnode *blkvp;

	ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);

	if (STREAMSTAB(getmajor(dev))) {
		ASSERT(vp->v_type == VCHR);
		smark(sp, SUPD);
		return (strwrite(vp, uiop, cr));
	}

	/*
	 * Plain old character devices that set D_U64BIT can have
	 * unrestricted offsets.
	 */
	maxoff = spec_maxoffset(vp);
	ASSERT(maxoff != -1 || vp->v_type == VCHR);

	if (maxoff != -1 && (uiop->uio_loffset < 0 ||
	    uiop->uio_loffset + uiop->uio_resid > maxoff))
		return (EINVAL);

	if (vp->v_type == VCHR) {
		smark(sp, SUPD);
		ASSERT(STREAMSTAB(getmajor(dev)) == 0);
		return (cdev_write(dev, uiop, cr));
	}

	if (uiop->uio_resid == 0)
		return (0);

	error = 0;
	blkvp = sp->s_commonvp;
	bdevsize = SPEC_SIZE(VTOS(blkvp));

	do {
		int pagecreate;
		int newpage;
		caddr_t base;
		offset_t diff;

		off = uiop->uio_loffset & (offset_t)MAXBMASK;
		on = (ulong_t)(uiop->uio_loffset & MAXBOFFSET);
		n = (size_t)MIN(MAXBSIZE - on, uiop->uio_resid);
		pagecreate = 0;

		diff = bdevsize - uiop->uio_loffset;
		if (diff <= 0) {
			error = ENXIO;
			break;
		}
		if (diff < n)
			n = (size_t)diff;

		/*
		 * Check to see if we can skip reading in the page
		 * and just allocate the memory.  We can do this
		 * if we are going to rewrite the entire mapping
		 * or if we are going to write to end of the device
		 * from the beginning of the mapping.
		 */
		if (n == MAXBSIZE || (on == 0 && (off + n) == bdevsize))
			pagecreate = 1;

		newpage = 0;
		if (vpm_enable) {
			error = vpm_data_copy(blkvp, (u_offset_t)(off + on),
				n, uiop, !pagecreate, NULL, 0, S_WRITE);
		} else {
			base = segmap_getmapflt(segkmap, blkvp,
			    (u_offset_t)(off + on), n, !pagecreate, S_WRITE);

			/*
			 * segmap_pagecreate() returns 1 if it calls
			 * page_create_va() to allocate any pages.
			 */

			if (pagecreate)
				newpage = segmap_pagecreate(segkmap, base + on,
					n, 0);

			error = uiomove(base + on, n, UIO_WRITE, uiop);
		}

		if (!vpm_enable && pagecreate &&
		    uiop->uio_loffset <
		    P2ROUNDUP_TYPED(off + on + n, PAGESIZE, offset_t)) {
			/*
			 * We created pages w/o initializing them completely,
			 * thus we need to zero the part that wasn't set up.
			 * This can happen if we write to the end of the device
			 * or if we had some sort of error during the uiomove.
			 */
			long nzero;
			offset_t nmoved;

			nmoved = (uiop->uio_loffset - (off + on));
			if (nmoved < 0 || nmoved > n) {
				panic("spec_write: nmoved bogus");
				/*NOTREACHED*/
			}
			nzero = (long)P2ROUNDUP(on + n, PAGESIZE) -
			    (on + nmoved);
			if (nzero < 0 || (on + nmoved + nzero > MAXBSIZE)) {
				panic("spec_write: nzero bogus");
				/*NOTREACHED*/
			}
			(void) kzero(base + on + nmoved, (size_t)nzero);
		}

		/*
		 * Unlock the pages which have been allocated by
		 * page_create_va() in segmap_pagecreate().
		 */
		if (!vpm_enable && newpage)
			segmap_pageunlock(segkmap, base + on,
				(size_t)n, S_WRITE);

		if (error == 0) {
			int flags = 0;

			/*
			 * Force write back for synchronous write cases.
			 */
			if (ioflag & (FSYNC|FDSYNC))
				flags = SM_WRITE;
			else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
				/*
				 * Have written a whole block.
				 * Start an asynchronous write and
				 * mark the buffer to indicate that
				 * it won't be needed again soon.
				 * Push swap files here, since it
				 * won't happen anywhere else.
				 */
				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
			}
			smark(sp, SUPD|SCHG);
			if (vpm_enable) {
				error = vpm_sync_pages(blkvp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			if (vpm_enable) {
				(void) vpm_sync_pages(blkvp, off, n, SM_INVAL);
			} else {
				(void) segmap_release(segkmap, base, SM_INVAL);
			}
		}

	} while (error == 0 && uiop->uio_resid > 0 && n != 0);

	return (error);
}

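/*
 * Ioctl is only supported on character special files: STREAMS devices go
 * through strioctl(), all others through cdev_ioctl().
 */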
static int
spec_ioctl(struct vnode *vp, int cmd, intptr_t arg, int mode, struct cred *cr,
    int *rvalp)
{
	struct snode *sp;
	dev_t dev;
	int error;

	if (vp->v_type != VCHR)
		return (ENOTTY);
	sp = VTOS(vp);
	dev = sp->s_dev;
	if (STREAMSTAB(getmajor(dev))) {
		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
	} else {
		error = cdev_ioctl(dev, cmd, arg, mode, cr, rvalp);
	}
	return (error);
}

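/*
 * Get attributes.  Attributes come from the real vnode when one exists
 * and are otherwise synthesized from the snode; size and times are
 * always taken from the snode itself.
 */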
static int
spec_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cr)
{
	int error;
	struct snode *sp;
	struct vnode *realvp;

	/* With ATTR_COMM we will not get attributes from realvp */
	if (flags & ATTR_COMM) {
		sp = VTOS(vp);
		vp = sp->s_commonvp;
	}
	sp = VTOS(vp);
	realvp = sp->s_realvp;

	if (realvp == NULL) {
		static int snode_shift	= 0;

		/*
		 * Calculate the amount of bitshift to a snode pointer which
		 * will still keep it unique.  See below.
		 */
		if (snode_shift == 0)
			snode_shift = highbit(sizeof (struct snode));
		ASSERT(snode_shift > 0);

		/*
		 * No real vnode behind this one.  Fill in the fields
		 * from the snode.
		 *
		 * This code should be refined to return only the
		 * attributes asked for instead of all of them.
		 */
		vap->va_type = vp->v_type;
		vap->va_mode = 0;
		vap->va_uid = vap->va_gid = 0;
		vap->va_fsid = sp->s_fsid;

		/*
		 * If the va_nodeid is > MAX_USHORT, then i386 stats might
		 * fail. So we shift down the snode pointer to try and get
		 * the most uniqueness into 16-bits.
		 */
		vap->va_nodeid = ((ino64_t)(uintptr_t)sp >> snode_shift) &
		    0xFFFF;
		vap->va_nlink = 0;
		vap->va_rdev = sp->s_dev;

		/*
		 * va_nblocks is the number of 512 byte blocks used to store
		 * the mknod for the device, not the number of blocks on the
		 * device itself.  This is typically zero since the mknod is
		 * represented directly in the inode itself.
		 */
		vap->va_nblocks = 0;
	} else {
		error = VOP_GETATTR(realvp, vap, flags, cr);
		if (error != 0)
			return (error);
	}

	/* set the size from the snode */
	vap->va_size = SPEC_SIZE(VTOS(sp->s_commonvp));
	vap->va_blksize = MAXBSIZE;

	mutex_enter(&sp->s_lock);
	vap->va_atime.tv_sec = sp->s_atime;
	vap->va_mtime.tv_sec = sp->s_mtime;
	vap->va_ctime.tv_sec = sp->s_ctime;
	mutex_exit(&sp->s_lock);

	vap->va_atime.tv_nsec = 0;
	vap->va_mtime.tv_nsec = 0;
	vap->va_ctime.tv_nsec = 0;
	vap->va_seq = 0;

	return (0);
}

static int
spec_setattr(
	struct vnode *vp,
	struct vattr *vap,
	int flags,
	struct cred *cr,
	caller_context_t *ctp)
{
	struct snode *sp = VTOS(vp);
	struct vnode *realvp;
	int error;

	if (vp->v_type == VCHR && vp->v_stream && (vap->va_mask & AT_SIZE)) {
		/*
		 * 1135080:	O_TRUNC should have no effect on
		 *		named pipes and terminal devices.
		 */
		ASSERT(vap->va_mask == AT_SIZE);
		return (0);
	}

	if ((realvp = sp->s_realvp) == NULL)
		error = 0;	/* no real vnode to update */
	else
		error = VOP_SETATTR(realvp, vap, flags, cr, ctp);
	if (error == 0) {
		/*
		 * If times were changed, update snode.
		 */
		mutex_enter(&sp->s_lock);
		if (vap->va_mask & AT_ATIME)
			sp->s_atime = vap->va_atime.tv_sec;
		if (vap->va_mask & AT_MTIME) {
			sp->s_mtime = vap->va_mtime.tv_sec;
			sp->s_ctime = gethrestime_sec();
		}
		mutex_exit(&sp->s_lock);
	}
	return (error);
}

static int
spec_access(struct vnode *vp, int mode, int flags, struct cred *cr)
{
	struct vnode *realvp;
	struct snode *sp = VTOS(vp);

	if ((realvp = sp->s_realvp) != NULL)
		return (VOP_ACCESS(realvp, mode, flags, cr));
	else
		return (0);	/* Allow all access. */
}

/*
 * This can be called if creat or an open with O_CREAT is done on the root
 * of a lofs mount where the mounted entity is a special file.
 */
/*ARGSUSED*/
static int
spec_create(struct vnode *dvp, char *name, vattr_t *vap, enum vcexcl excl,
    int mode, struct vnode **vpp, struct cred *cr, int flag)
{
	int error;

	ASSERT(dvp && (dvp->v_flag & VROOT) && *name == '\0');
	if (excl == NONEXCL) {
		if (mode && (error = spec_access(dvp, mode, 0, cr)))
			return (error);
		VN_HOLD(dvp);
		return (0);
	}
	return (EEXIST);
}

/*
 * In order to sync out the snode times without multi-client problems,
 * make sure the times written out are never earlier than the times
 * already set in the vnode.
 */
static int
spec_fsync(struct vnode *vp, int syncflag, struct cred *cr)
{
	struct snode *sp = VTOS(vp);
	struct vnode *realvp;
	struct vnode *cvp;
	struct vattr va, vatmp;

	/* If times didn't change, don't flush anything. */
	mutex_enter(&sp->s_lock);
	if ((sp->s_flag & (SACC|SUPD|SCHG)) == 0 && vp->v_type != VBLK) {
		mutex_exit(&sp->s_lock);
		return (0);
	}
	sp->s_flag &= ~(SACC|SUPD|SCHG);
	mutex_exit(&sp->s_lock);
	cvp = sp->s_commonvp;
	realvp = sp->s_realvp;

	if (vp->v_type == VBLK && cvp != vp && vn_has_cached_data(cvp) &&
	    (cvp->v_flag & VISSWAP) == 0)
		(void) VOP_PUTPAGE(cvp, (offset_t)0, 0, 0, cr);

	/*
	 * For devices that support it, force write cache to stable storage.
	 * We don't need the lock to check s_flags since we can treat
	 * SNOFLUSH as a hint.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
	    !(sp->s_flag & SNOFLUSH)) {
		int rval, rc;
		rc = cdev_ioctl(vp->v_rdev, DKIOCFLUSHWRITECACHE,
		    NULL, FNATIVE|FKIOCTL, cr, &rval);
		if (rc == ENOTSUP || rc == ENOTTY) {
			mutex_enter(&sp->s_lock);
			sp->s_flag |= SNOFLUSH;
			mutex_exit(&sp->s_lock);
		}
	}

	/*
	 * If no real vnode to update, don't flush anything.
	 */
	if (realvp == NULL)
		return (0);

	vatmp.va_mask = AT_ATIME|AT_MTIME;
	if (VOP_GETATTR(realvp, &vatmp, 0, cr) == 0) {

		mutex_enter(&sp->s_lock);
		if (vatmp.va_atime.tv_sec > sp->s_atime)
			va.va_atime = vatmp.va_atime;
		else {
			va.va_atime.tv_sec = sp->s_atime;
			va.va_atime.tv_nsec = 0;
		}
		if (vatmp.va_mtime.tv_sec > sp->s_mtime)
			va.va_mtime = vatmp.va_mtime;
		else {
			va.va_mtime.tv_sec = sp->s_mtime;
			va.va_mtime.tv_nsec = 0;
		}
		mutex_exit(&sp->s_lock);

		va.va_mask = AT_ATIME|AT_MTIME;
		(void) VOP_SETATTR(realvp, &va, 0, cr, NULL);
	}
	(void) VOP_FSYNC(realvp, syncflag, cr);
	return (0);
}

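/*
 * Inactive: run when the last hold on the vnode is released.  Pushes any
 * pending time updates to the real vnode, releases the holds on the
 * common vnode, real vnode, devinfo node and device policy, and frees
 * the snode.
 */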
/*ARGSUSED*/
static void
spec_inactive(struct vnode *vp, struct cred *cr)
{
	struct snode *sp = VTOS(vp);
	struct vnode *cvp;
	struct vnode *rvp;

	/*
	 * If no one has reclaimed the vnode, remove from the
	 * cache now.
	 */
	if (vp->v_count < 1) {
		panic("spec_inactive: Bad v_count");
		/*NOTREACHED*/
	}
	mutex_enter(&stable_lock);

	mutex_enter(&vp->v_lock);
	/*
	 * Drop the temporary hold by vn_rele now
	 */
	if (--vp->v_count != 0) {
		mutex_exit(&vp->v_lock);
		mutex_exit(&stable_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	sdelete(sp);
	mutex_exit(&stable_lock);

	/* We are the sole owner of sp now */
	cvp = sp->s_commonvp;
	rvp = sp->s_realvp;

	if (rvp) {
		/*
		 * If the snode times changed, then update the times
		 * associated with the "realvp".
		 */
		if ((sp->s_flag & (SACC|SUPD|SCHG)) != 0) {

			struct vattr va, vatmp;

			mutex_enter(&sp->s_lock);
			sp->s_flag &= ~(SACC|SUPD|SCHG);
			mutex_exit(&sp->s_lock);
			vatmp.va_mask = AT_ATIME|AT_MTIME;
			/*
			 * The user may not own the device, but we
			 * want to update the attributes anyway.
			 */
			if (VOP_GETATTR(rvp, &vatmp, 0, kcred) == 0) {
				if (vatmp.va_atime.tv_sec > sp->s_atime)
					va.va_atime = vatmp.va_atime;
				else {
					va.va_atime.tv_sec = sp->s_atime;
					va.va_atime.tv_nsec = 0;
				}
				if (vatmp.va_mtime.tv_sec > sp->s_mtime)
					va.va_mtime = vatmp.va_mtime;
				else {
					va.va_mtime.tv_sec = sp->s_mtime;
					va.va_mtime.tv_nsec = 0;
				}

				va.va_mask = AT_ATIME|AT_MTIME;
				(void) VOP_SETATTR(rvp, &va, 0, kcred, NULL);
			}
		}
	}
	ASSERT(!vn_has_cached_data(vp));
	vn_invalid(vp);

	/* if we are sharing another file systems vfs, release it */
	if (vp->v_vfsp && (vp->v_vfsp != &spec_vfs))
		VFS_RELE(vp->v_vfsp);

	/* if we have a realvp, release the realvp */
	if (rvp)
		VN_RELE(rvp);

	/* if we have a common, release the common */
	if (cvp && (cvp != vp)) {
		VN_RELE(cvp);
#ifdef DEBUG
	} else if (cvp) {
		/*
		 * if this is the last reference to a common vnode, any
		 * associated stream had better have been closed
		 */
		ASSERT(cvp == vp);
		ASSERT(cvp->v_stream == NULL);
#endif /* DEBUG */
	}

	/*
	 * if we have a hold on a devinfo node (established by
	 * spec_assoc_vp_with_devi), release the hold
	 */
	if (sp->s_dip)
		ddi_release_devi(sp->s_dip);

	/*
	 * If we have an associated device policy, release it.
	 */
	if (sp->s_plcy != NULL)
		dpfree(sp->s_plcy);

	/*
	 * If all holds on the devinfo node are through specfs/devfs
	 * and we just destroyed the last specfs node associated with the
	 * device, then the devinfo node reference count should now be
	 * zero.  We can't check this because there may be other holds
	 * on the node from non file system sources: ddi_hold_devi_by_instance
	 * for example.
	 */
	kmem_cache_free(snode_cache, sp);
}

static int
spec_fid(struct vnode *vp, struct fid *fidp)
{
	struct vnode *realvp;
	struct snode *sp = VTOS(vp);

	if ((realvp = sp->s_realvp) != NULL)
		return (VOP_FID(realvp, fidp));
	else
		return (EINVAL);
}

/*ARGSUSED1*/
static int
spec_seek(struct vnode *vp, offset_t ooff, offset_t *noffp)
{
	offset_t maxoff = spec_maxoffset(vp);

	if (maxoff == -1 || *noffp <= maxoff)
		return (0);
	else
		return (EINVAL);
}

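/*
 * File and record locking.  Locking is refused with EAGAIN while the
 * device is mapped; otherwise the generic fs_frlock() does the work.
 */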
static int
spec_frlock(
	struct vnode *vp,
	int		cmd,
	struct flock64	*bfp,
	int		flag,
	offset_t	offset,
	struct flk_callback *flk_cbp,
	struct cred	*cr)
{
	struct snode *sp = VTOS(vp);
	struct snode *csp;

	csp = VTOS(sp->s_commonvp);
	/*
	 * If file is being mapped, disallow frlock.
	 */
	if (csp->s_mapcnt > 0)
		return (EAGAIN);

	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr));
}

static int
spec_realvp(struct vnode *vp, struct vnode **vpp)
{
	struct vnode *rvp;

	if ((rvp = VTOS(vp)->s_realvp) != NULL) {
		vp = rvp;
		if (VOP_REALVP(vp, &rvp) == 0)
			vp = rvp;
	}

	*vpp = vp;
	return (0);
}

/*
 * Return all the pages from [off..off + len] in block
 * or character device.
 */
static int
spec_getpage(
	struct vnode	*vp,
	offset_t	off,
	size_t		len,
	uint_t		*protp,
	page_t		*pl[],
	size_t		plsz,
	struct seg	*seg,
	caddr_t		addr,
	enum seg_rw	rw,
	struct cred	*cr)
{
	struct snode *sp = VTOS(vp);
	int err;

	ASSERT(sp->s_commonvp == vp);

	/*
	 * XXX	Given the above assertion, this might not do
	 *	what is wanted here.
	 */
	if (vp->v_flag & VNOMAP)
		return (ENOSYS);
	TRACE_4(TR_FAC_SPECFS, TR_SPECFS_GETPAGE,
		"specfs getpage:vp %p off %llx len %ld snode %p",
		vp, off, len, sp);

	switch (vp->v_type) {
	case VBLK:
		if (protp != NULL)
			*protp = PROT_ALL;

		if (((u_offset_t)off + len) > (SPEC_SIZE(sp) + PAGEOFFSET))
			return (EFAULT);	/* beyond EOF */

		if (len <= PAGESIZE)
			err = spec_getapage(vp, (u_offset_t)off, len, protp, pl,
			    plsz, seg, addr, rw, cr);
		else
			err = pvn_getpages(spec_getapage, vp, (u_offset_t)off,
			    len, protp, pl, plsz, seg, addr, rw, cr);
		break;

	case VCHR:
		cmn_err(CE_NOTE, "spec_getpage called for character device. "
		    "Check any non-ON consolidation drivers");
		err = 0;
		pl[0] = (page_t *)0;
		break;

	default:
		panic("spec_getpage: bad v_type 0x%x", vp->v_type);
		/*NOTREACHED*/
	}

	return (err);
}

extern int klustsize;	/* set in machdep.c */

int spec_ra = 1;
int spec_lostpage;	/* number of times we lost original page */

/*ARGSUSED2*/
static int
spec_getapage(
	struct vnode *vp,
	u_offset_t	off,
	size_t		len,
	uint_t		*protp,
	page_t		*pl[],
	size_t		plsz,
	struct seg	*seg,
	caddr_t		addr,
	enum seg_rw	rw,
	struct cred	*cr)
{
	struct snode *sp;
	struct buf *bp;
	page_t *pp, *pp2;
	u_offset_t io_off1, io_off2;
	size_t io_len1;
	size_t io_len2;
	size_t blksz;
	u_offset_t blkoff;
	int dora, err;
	page_t *pagefound;
	uint_t xlen;
	size_t adj_klustsize;
	u_offset_t size;
	u_offset_t tmpoff;

	sp = VTOS(vp);
	TRACE_3(TR_FAC_SPECFS, TR_SPECFS_GETAPAGE,
		"specfs getapage:vp %p off %llx snode %p", vp, off, sp);
reread:

	err = 0;
	bp = NULL;
	pp = NULL;
	pp2 = NULL;

	if (pl != NULL)
		pl[0] = NULL;

	size = SPEC_SIZE(VTOS(sp->s_commonvp));

	if (spec_ra && sp->s_nextr == off)
		dora = 1;
	else
		dora = 0;

	if (size == UNKNOWN_SIZE) {
		dora = 0;
		adj_klustsize = PAGESIZE;
	} else {
		adj_klustsize = dora ? klustsize : PAGESIZE;
	}

again:
	if ((pagefound = page_exists(vp, off)) == NULL) {
		if (rw == S_CREATE) {
			/*
			 * We're allocating a swap slot and its
			 * associated page was not found, so allocate
			 * and return it.
			 */
			if ((pp = page_create_va(vp, off,
			    PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
				panic("spec_getapage: page_create");
				/*NOTREACHED*/
			}
			io_len1 = PAGESIZE;
			sp->s_nextr = off + PAGESIZE;
		} else {
			/*
			 * Need to really do disk I/O to get the page(s).
			 */
			blkoff = (off / adj_klustsize) * adj_klustsize;
			if (size == UNKNOWN_SIZE) {
				blksz = PAGESIZE;
			} else {
				if (blkoff + adj_klustsize <= size)
					blksz = adj_klustsize;
				else
					blksz =
					    MIN(size - blkoff, adj_klustsize);
			}

			pp = pvn_read_kluster(vp, off, seg, addr, &tmpoff,
			    &io_len1, blkoff, blksz, 0);
			io_off1 = tmpoff;
			/*
			 * Make sure the page didn't sneak into the
			 * cache while we blocked in pvn_read_kluster.
			 */
			if (pp == NULL)
				goto again;

			/*
			 * Zero part of page which we are not
			 * going to be reading from disk now.
			 */
			xlen = (uint_t)(io_len1 & PAGEOFFSET);
			if (xlen != 0)
				pagezero(pp->p_prev, xlen, PAGESIZE - xlen);

			bp = spec_startio(vp, pp, io_off1, io_len1,
			    pl == NULL ? (B_ASYNC | B_READ) : B_READ);
			sp->s_nextr = io_off1 + io_len1;
		}
	}

	if (dora && rw != S_CREATE) {
		u_offset_t off2;
		caddr_t addr2;

		off2 = ((off / adj_klustsize) + 1) * adj_klustsize;
		addr2 = addr + (off2 - off);

		pp2 = NULL;
		/*
		 * If we are past EOF then don't bother trying
		 * with read-ahead.
		 */
		if (off2 >= size)
			pp2 = NULL;
		else {
			if (off2 + adj_klustsize <= size)
				blksz = adj_klustsize;
			else
				blksz = MIN(size - off2, adj_klustsize);

			pp2 = pvn_read_kluster(vp, off2, seg, addr2, &tmpoff,
			    &io_len2, off2, blksz, 1);
			io_off2 = tmpoff;
		}

		if (pp2 != NULL) {
			/*
			 * Zero part of page which we are not
			 * going to be reading from disk now.
			 */
			xlen = (uint_t)(io_len2 & PAGEOFFSET);
			if (xlen != 0)
				pagezero(pp2->p_prev, xlen, PAGESIZE - xlen);

			(void) spec_startio(vp, pp2, io_off2, io_len2,
			    B_READ | B_ASYNC);
		}
	}

	if (pl == NULL)
		return (err);

	if (bp != NULL) {
		err = biowait(bp);
		pageio_done(bp);

		if (err) {
			if (pp != NULL)
				pvn_read_done(pp, B_ERROR);
			return (err);
		}
	}

	if (pagefound) {
		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
		/*
		 * Page exists in the cache, acquire the appropriate
		 * lock.  If this fails, start all over again.
		 */

		if ((pp = page_lookup(vp, off, se)) == NULL) {
			spec_lostpage++;
			goto reread;
		}
		pl[0] = pp;
		pl[1] = NULL;

		sp->s_nextr = off + PAGESIZE;
		return (0);
	}

	if (pp != NULL)
		pvn_plist_init(pp, pl, plsz, off, io_len1, rw);
	return (0);
}

/*
 * Flags are composed of {B_INVAL, B_DIRTY, B_FREE, B_DONTNEED, B_FORCE}.
 * If len == 0, do from off to EOF.
 *
 * The normal cases should be len == 0 & off == 0 (entire vp list),
 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
 * (from pageout).
 */
int
spec_putpage(
	struct vnode *vp,
	offset_t	off,
	size_t		len,
	int		flags,
	struct cred	*cr)
{
	struct snode *sp = VTOS(vp);
	struct vnode *cvp;
	page_t *pp;
	u_offset_t io_off;
	size_t io_len = 0;	/* for lint */
	int err = 0;
	u_offset_t size;
	u_offset_t tmpoff;

	ASSERT(vp->v_count != 0);

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	cvp = sp->s_commonvp;
	size = SPEC_SIZE(VTOS(cvp));

	if (!vn_has_cached_data(vp) || off >= size)
		return (0);

	ASSERT(vp->v_type == VBLK && cvp == vp);
	TRACE_4(TR_FAC_SPECFS, TR_SPECFS_PUTPAGE,
		"specfs putpage:vp %p off %llx len %ld snode %p",
		vp, off, len, sp);

	if (len == 0) {
		/*
		 * Search the entire vp list for pages >= off.
		 */
		err = pvn_vplist_dirty(vp, off, spec_putapage,
		    flags, cr);
	} else {
		u_offset_t eoff;

		/*
		 * Loop over all offsets in the range [off...off + len]
		 * looking for pages to deal with.  We set limits so
		 * that we kluster to klustsize boundaries.
		 */
		eoff = off + len;
		for (io_off = off; io_off < eoff && io_off < size;
		    io_off += io_len) {
			/*
			 * If we are not invalidating, synchronously
			 * freeing or writing pages use the routine
			 * page_lookup_nowait() to prevent reclaiming
			 * them from the free list.
			 */
			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
				pp = page_lookup(vp, io_off,
					(flags & (B_INVAL | B_FREE)) ?
					    SE_EXCL : SE_SHARED);
			} else {
				pp = page_lookup_nowait(vp, io_off,
					(flags & B_FREE) ? SE_EXCL : SE_SHARED);
			}

			if (pp == NULL || pvn_getdirty(pp, flags) == 0)
				io_len = PAGESIZE;
			else {
				err = spec_putapage(vp, pp, &tmpoff, &io_len,
				    flags, cr);
				io_off = tmpoff;
				if (err != 0)
					break;
				/*
				 * "io_off" and "io_len" are returned as
				 * the range of pages we actually wrote.
				 * This allows us to skip ahead more quickly
				 * since several pages may've been dealt
				 * with by this iteration of the loop.
				 */
			}
		}
	}
	return (err);
}


/*
 * Write out a single page, possibly klustering adjacent
 * dirty pages.
 */
/*ARGSUSED5*/
static int
spec_putapage(
	struct vnode	*vp,
	page_t		*pp,
	u_offset_t	*offp,		/* return value */
	size_t		*lenp,		/* return value */
	int		flags,
	struct cred	*cr)
{
	struct snode *sp = VTOS(vp);
	u_offset_t io_off;
	size_t io_len;
	size_t blksz;
	u_offset_t blkoff;
	int err = 0;
	struct buf *bp;
	u_offset_t size;
	size_t adj_klustsize;
	u_offset_t tmpoff;

	/*
	 * Destroy read ahead value since we are really going to write.
	 */
	sp->s_nextr = 0;
	size = SPEC_SIZE(VTOS(sp->s_commonvp));

	adj_klustsize = klustsize;

	blkoff = (pp->p_offset / adj_klustsize) * adj_klustsize;

	if (blkoff + adj_klustsize <= size)
		blksz = adj_klustsize;
	else
		blksz = size - blkoff;

	/*
	 * Find a kluster that fits in one contiguous chunk.
	 */
	pp = pvn_write_kluster(vp, pp, &tmpoff, &io_len, blkoff,
		blksz, flags);
	io_off = tmpoff;

	/*
	 * Check for page length rounding problems
	 * XXX - Is this necessary?
	 */
	if (io_off + io_len > size) {
		ASSERT((io_off + io_len) - size < PAGESIZE);
		io_len = size - io_off;
	}

	bp = spec_startio(vp, pp, io_off, io_len, B_WRITE | flags);

	/*
	 * Wait for i/o to complete if the request is not B_ASYNC.
	 */
	if ((flags & B_ASYNC) == 0) {
		err = biowait(bp);
		pageio_done(bp);
		pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);
	}

	if (offp)
		*offp = io_off;
	if (lenp)
		*lenp = io_len;
	TRACE_4(TR_FAC_SPECFS, TR_SPECFS_PUTAPAGE,
		"specfs putapage:vp %p offp %p snode %p err %d",
		vp, offp, sp, err);
	return (err);
}

/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 */
static struct buf *
spec_startio(
	struct vnode *vp,
	page_t		*pp,
	u_offset_t	io_off,
	size_t		io_len,
	int		flags)
{
	struct buf *bp;

	bp = pageio_setup(pp, io_len, vp, flags);

	bp->b_edev = vp->v_rdev;
	bp->b_dev = cmpdev(vp->v_rdev);
	bp->b_blkno = btodt(io_off);
	bp->b_un.b_addr = (caddr_t)0;

	(void) bdev_strategy(bp);

	if (flags & B_READ)
		lwp_stat_update(LWP_STAT_INBLK, 1);
	else
		lwp_stat_update(LWP_STAT_OUBLK, 1);

	return (bp);
}

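/*
 * Poll.  Block devices use fs_poll().  STREAMS character devices go
 * through strpoll(); other character devices use the driver's chpoll(9E)
 * entry point when one exists, falling back to fs_poll().
 */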
1985static int
1986spec_poll(
1987	struct vnode	*vp,
1988	short		events,
1989	int		anyyet,
1990	short		*reventsp,
1991	struct pollhead **phpp)
1992{
1993	dev_t dev;
1994	int error;
1995
1996	if (vp->v_type == VBLK)
1997		error = fs_poll(vp, events, anyyet, reventsp, phpp);
1998	else {
1999		ASSERT(vp->v_type == VCHR);
2000		dev = vp->v_rdev;
2001		if (STREAMSTAB(getmajor(dev))) {
2002			ASSERT(vp->v_stream != NULL);
2003			error = strpoll(vp->v_stream, events, anyyet,
2004			    reventsp, phpp);
2005		} else if (devopsp[getmajor(dev)]->devo_cb_ops->cb_chpoll) {
2006			error = cdev_poll(dev, events, anyyet, reventsp, phpp);
2007		} else {
2008			error = fs_poll(vp, events, anyyet, reventsp, phpp);
2009		}
2010	}
2011	return (error);
2012}
2013
2014/*
2015 * This routine is called through the cdevsw[] table to handle
2016 * traditional mmap'able devices that support a d_mmap function.
2017 */
2018/*ARGSUSED8*/
2019int
2020spec_segmap(
2021	dev_t dev,
2022	off_t off,
2023	struct as *as,
2024	caddr_t *addrp,
2025	off_t len,
2026	uint_t prot,
2027	uint_t maxprot,
2028	uint_t flags,
2029	struct cred *cred)
2030{
2031	struct segdev_crargs dev_a;
2032	int (*mapfunc)(dev_t dev, off_t off, int prot);
2033	size_t i;
2034	int	error;
2035
2036	if ((mapfunc = devopsp[getmajor(dev)]->devo_cb_ops->cb_mmap) == nodev)
2037		return (ENODEV);
2038	TRACE_4(TR_FAC_SPECFS, TR_SPECFS_SEGMAP,
2039		"specfs segmap:dev %x as %p len %lx prot %x",
2040		dev, as, len, prot);
2041
2042	/*
2043	 * Character devices that support the d_mmap
2044	 * interface can only be mmap'ed shared.
2045	 */
2046	if ((flags & MAP_TYPE) != MAP_SHARED)
2047		return (EINVAL);
2048
2049	/*
2050	 * Check to ensure that the entire range is
2051	 * legal and we are not trying to map in
2052	 * more than the device will let us.
2053	 */
2054	for (i = 0; i < len; i += PAGESIZE) {
2055		if (cdev_mmap(mapfunc, dev, off + i, maxprot) == -1)
2056			return (ENXIO);
2057	}
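	/*
	 * cdev_mmap() returns -1 when the driver cannot map the given
	 * offset, so every page in the request is probed here before any
	 * address space changes are made.
	 */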
2058
2059	as_rangelock(as);
2060	if ((flags & MAP_FIXED) == 0) {
2061		/*
2062		 * Pick an address w/o worrying about
2063		 * any vac alignment constraints.
2064		 */
2065		map_addr(addrp, len, (offset_t)off, 0, flags);
2066		if (*addrp == NULL) {
2067			as_rangeunlock(as);
2068			return (ENOMEM);
2069		}
2070	} else {
2071		/*
2072		 * User-specified address; blow away any previous mappings.
2073		 */
2074		(void) as_unmap(as, *addrp, len);
2075	}
2076
2077	dev_a.mapfunc = mapfunc;
2078	dev_a.dev = dev;
2079	dev_a.offset = off;
2080	dev_a.prot = (uchar_t)prot;
2081	dev_a.maxprot = (uchar_t)maxprot;
2082	dev_a.hat_flags = 0;
2083	dev_a.hat_attr = 0;
2084	dev_a.devmap_data = NULL;
2085
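	/*
	 * as_map() builds a seg_dev segment from dev_a; the segment driver
	 * is expected to use mapfunc (the driver's d_mmap entry) to turn
	 * faulting offsets into page frames.
	 */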
2086	error = as_map(as, *addrp, len, segdev_create, &dev_a);
2087	as_rangeunlock(as);
2088	return (error);
2089}
2090
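/*
 * Choose a mapping strategy for a character device: a driver-supplied
 * cb_segmap is used directly; otherwise devmap_setup() is used when the
 * driver sets D_DEVMAP or has no usable cb_mmap; otherwise the legacy
 * d_mmap path is taken through spec_segmap().
 *
 * Illustrative cb_ops fragments (not from this file) and the path each
 * one selects:
 *
 *	.cb_segmap = xx_segmap		=> called via cdev_segmap
 *	.cb_devmap = xx_devmap		=> devmap_setup()
 *	.cb_mmap   = xx_mmap		=> spec_segmap() over d_mmap
 */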
2091int
2092spec_char_map(
2093	dev_t dev,
2094	offset_t off,
2095	struct as *as,
2096	caddr_t *addrp,
2097	size_t len,
2098	uchar_t prot,
2099	uchar_t maxprot,
2100	uint_t flags,
2101	struct cred *cred)
2102{
2103	int error = 0;
2104	major_t maj = getmajor(dev);
2105	int map_flag;
2106	int (*segmap)(dev_t, off_t, struct as *,
2107	    caddr_t *, off_t, uint_t, uint_t, uint_t, cred_t *);
2108	int (*devmap)(dev_t, devmap_cookie_t, offset_t,
2109		size_t, size_t *, uint_t);
2110	int (*mmap)(dev_t dev, off_t off, int prot);
2111
2112	/*
2113	 * Character device: let the device driver
2114	 * pick the appropriate segment driver.
2115	 *
2116	 * 4.x compat.: allow 'NULL' cb_segmap => spec_segmap
2117	 * Kindness: allow 'nulldev' cb_segmap => spec_segmap
2118	 */
2119	segmap = devopsp[maj]->devo_cb_ops->cb_segmap;
2120	if (segmap == NULL || segmap == nulldev || segmap == nodev) {
2121		mmap = devopsp[maj]->devo_cb_ops->cb_mmap;
2122		map_flag = devopsp[maj]->devo_cb_ops->cb_flag;
2123
2124		/*
2125		 * Use old mmap framework if the driver has both mmap
2126		 * and devmap entry points.  This prevents the system
2127		 * from calling an invalid devmap entry point in
2128		 * drivers that may have left garbage in the devmap
2129		 * field.
2130		 */
2131		if ((map_flag & D_DEVMAP) || mmap == NULL ||
2132		    mmap == nulldev || mmap == nodev) {
2133			devmap = devopsp[maj]->devo_cb_ops->cb_devmap;
2134
2135			/*
2136			 * If the driver provides a devmap entry point
2137			 * in cb_ops but not xx_segmap(9E), call
2138			 * devmap_setup with default settings (NULL)
2139			 * for callback_ops and the driver callback
2140			 * private data.
2141			 */
2142			if (devmap == nodev || devmap == NULL ||
2143			    devmap == nulldev)
2144				return (ENODEV);
2145
2146			error = devmap_setup(dev, off, as, addrp,
2147			    len, prot, maxprot, flags, cred);
2148
2149			return (error);
2150		} else
2151			segmap = spec_segmap;
2152	} else
2153		segmap = cdev_segmap;
2154
2155	return ((*segmap)(dev, (off_t)off, as, addrp, len, prot,
2156	    maxprot, flags, cred));
2157}
2158
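/*
 * VOP_MAP for special files.  Character devices are dispatched to
 * spec_char_map() above; block devices are mapped with segvn against
 * the common vnode, so mappings of the same device share pages.
 */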
2159static int
2160spec_map(
2161	struct vnode *vp,
2162	offset_t off,
2163	struct as *as,
2164	caddr_t *addrp,
2165	size_t len,
2166	uchar_t prot,
2167	uchar_t maxprot,
2168	uint_t flags,
2169	struct cred *cred)
2170{
2171	int error = 0;
2172
2173	if (vp->v_flag & VNOMAP)
2174		return (ENOSYS);
2175
2176	/*
2177	 * If file is locked, fail mapping attempt.
2178	 */
2179	if (vn_has_flocks(vp))
2180		return (EAGAIN);
2181
2182	if (vp->v_type == VCHR) {
2183		return (spec_char_map(vp->v_rdev, off, as, addrp, len, prot,
2184		    maxprot, flags, cred));
2185	} else if (vp->v_type == VBLK) {
2186		struct segvn_crargs vn_a;
2187		struct vnode *cvp;
2188		struct snode *sp;
2189
2190		/*
2191		 * Block device, use segvn mapping to the underlying commonvp
2192		 * for pages.
2193		 */
2194		if (off > spec_maxoffset(vp))
2195			return (ENXIO);
2196
2197		sp = VTOS(vp);
2198		cvp = sp->s_commonvp;
2199		ASSERT(cvp != NULL);
2200
2201		if (off < 0 || ((offset_t)(off + len) < 0))
2202			return (ENXIO);
2203
2204		as_rangelock(as);
2205		if ((flags & MAP_FIXED) == 0) {
2206			map_addr(addrp, len, off, 1, flags);
2207			if (*addrp == NULL) {
2208				as_rangeunlock(as);
2209				return (ENOMEM);
2210			}
2211		} else {
2212			/*
2213			 * User-specified address; blow away any
2214			 * previous mappings.
2215			 */
2216			(void) as_unmap(as, *addrp, len);
2217		}
2218
2219		vn_a.vp = cvp;
2220		vn_a.offset = off;
2221		vn_a.type = flags & MAP_TYPE;
2222		vn_a.prot = (uchar_t)prot;
2223		vn_a.maxprot = (uchar_t)maxprot;
2224		vn_a.flags = flags & ~MAP_TYPE;
2225		vn_a.cred = cred;
2226		vn_a.amp = NULL;
2227		vn_a.szc = 0;
2228		vn_a.lgrp_mem_policy_flags = 0;
2229
2230		error = as_map(as, *addrp, len, segvn_create, &vn_a);
2231		as_rangeunlock(as);
2232	} else
2233		return (ENODEV);
2234
2235	return (error);
2236}
2237
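/*
 * spec_addmap() and spec_delmap() maintain s_mapcnt, the number of
 * pages mapped through the common snode.  spec_delmap() uses it to
 * notice the last unmapping and, when no other references remain,
 * to perform the deferred device close.
 */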
2238/*ARGSUSED1*/
2239static int
2240spec_addmap(
2241	struct vnode *vp,	/* the common vnode */
2242	offset_t off,
2243	struct as *as,
2244	caddr_t addr,
2245	size_t len,		/* how many bytes to add */
2246	uchar_t prot,
2247	uchar_t maxprot,
2248	uint_t flags,
2249	struct cred *cred)
2250{
2251	int error = 0;
2252	struct snode *csp = VTOS(vp);
2253	ulong_t npages;
2254
2255	ASSERT(vp != NULL && VTOS(vp)->s_commonvp == vp);
2256
2257	/*
2258	 * XXX	Given the above assertion, this might not
2259	 *	be a particularly sensible thing to test.
2260	 */
2261	if (vp->v_flag & VNOMAP)
2262		return (ENOSYS);
2263
2264	npages = btopr(len);
2265	LOCK_CSP(csp);
2266	csp->s_mapcnt += npages;
2267
2268	UNLOCK_CSP(csp);
2269	return (error);
2270}
2271
2272/*ARGSUSED1*/
2273static int
2274spec_delmap(
2275	struct vnode *vp,	/* the common vnode */
2276	offset_t off,
2277	struct as *as,
2278	caddr_t addr,
2279	size_t len,		/* how many bytes to take away */
2280	uint_t prot,
2281	uint_t maxprot,
2282	uint_t flags,
2283	struct cred *cred)
2284{
2285	struct snode *csp = VTOS(vp);
2286	ulong_t npages;
2287	long mcnt;
2288
2289	/* segdev passes us the common vp */
2290
2291	ASSERT(vp != NULL && VTOS(vp)->s_commonvp == vp);
2292
2293	/*
2294	 * XXX	Given the above assertion, this might not
2295	 *	be a particularly sensible thing to test..
2296	 *	be a particularly sensible thing to test.
2297	if (vp->v_flag & VNOMAP)
2298		return (ENOSYS);
2299
2300	npages = btopr(len);
2301
2302	LOCK_CSP(csp);
2303	mutex_enter(&csp->s_lock);
2304	mcnt = (csp->s_mapcnt -= npages);
2305
2306	if (mcnt == 0) {
2307		/*
2308		 * Call the close routine when the last reference of any
2309		 * kind through any [s, v]node goes away.  The s_dip hold
2310		 * on the devinfo node is released when the vnode is
2311		 * destroyed.
2312		 */
2313		if (csp->s_count == 0) {
2314			csp->s_flag &= ~(SNEEDCLOSE | SSIZEVALID);
2315
2316			/* See comment in spec_close() */
2317			if (csp->s_flag & (SCLONE | SSELFCLONE))
2318				csp->s_flag &= ~SDIPSET;
2319
2320			mutex_exit(&csp->s_lock);
2321
2322			(void) device_close(vp, 0, cred);
2323		} else
2324			mutex_exit(&csp->s_lock);
2325
2326		mutex_enter(&csp->s_lock);
2327	}
2328	ASSERT(mcnt >= 0);
2329
2330	UNLOCK_CSP_LOCK_HELD(csp);
2331	mutex_exit(&csp->s_lock);
2332
2333	return (0);
2334}
2335
2336static int
2337spec_dump(struct vnode *vp, caddr_t addr, int bn, int count)
2338{
2339	ASSERT(vp->v_type == VBLK);
2340	return (bdev_dump(vp->v_rdev, addr, bn, count));
2341}
2342
2343
2344/*
2345 * Do i/o on the given page list from/to vp, io_off for io_len.
2346 * Flags are composed of:
2347 * 	{B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_READ, B_WRITE}
2348 * If B_ASYNC is not set i/o is waited for.
2349 * If B_ASYNC is not set, the i/o is waited for.
2350/*ARGSUSED5*/
2351static int
2352spec_pageio(
2353	struct vnode *vp,
2354	page_t	*pp,
2355	u_offset_t io_off,
2356	size_t	io_len,
2357	int	flags,
2358	cred_t	*cr)
2359{
2360	struct buf *bp = NULL;
2361	int err = 0;
2362
2363	if (pp == NULL)
2364		return (EINVAL);
2365
2366	bp = spec_startio(vp, pp, io_off, io_len, flags);
2367
2368	/*
2369	 * Wait for i/o to complete if the request is not B_ASYNC.
2370	 */
2371	if ((flags & B_ASYNC) == 0) {
2372		err = biowait(bp);
2373		pageio_done(bp);
2374	}
2375	return (err);
2376}
2377
2378/*
2379 * Set ACL on underlying vnode if one exists, or return ENOSYS otherwise.
2380 */
2381int
2382spec_setsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, struct cred *cr)
2383{
2384	struct vnode *realvp;
2385	struct snode *sp = VTOS(vp);
2386	int error;
2387
2388	/*
2389	 * The acl(2) system calls VOP_RWLOCK on the file before setting an
2390	 * ACL, but since specfs does not serialize reads and writes, this
2391	 * VOP does not do anything.  However, some backing file systems may
2392	 * expect the lock to be held before setting an ACL, so it is taken
2393	 * here privately to avoid serializing specfs reads and writes.
2394	 */
2395	if ((realvp = sp->s_realvp) != NULL) {
2396		(void) VOP_RWLOCK(realvp, V_WRITELOCK_TRUE, NULL);
2397		error = VOP_SETSECATTR(realvp, vsap, flag, cr);
2398		(void) VOP_RWUNLOCK(realvp, V_WRITELOCK_TRUE, NULL);
2399		return (error);
2400	} else
2401		return (fs_nosys());
2402}
2403
2404/*
2405 * Get ACL from underlying vnode if one exists, or fabricate it from
2406 * the permissions returned by spec_getattr() otherwise.
2407 */
2408int
2409spec_getsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, struct cred *cr)
2410{
2411	struct vnode *realvp;
2412	struct snode *sp = VTOS(vp);
2413
2414	if ((realvp = sp->s_realvp) != NULL)
2415		return (VOP_GETSECATTR(realvp, vsap, flag, cr));
2416	else
2417		return (fs_fab_acl(vp, vsap, flag, cr));
2418}
2419
2420int
2421spec_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
2422{
2423	vnode_t *realvp;
2424	struct snode *sp = VTOS(vp);
2425
2426	if ((realvp = sp->s_realvp) != NULL)
2427		return (VOP_PATHCONF(realvp, cmd, valp, cr));
2428	else
2429		return (fs_pathconf(vp, cmd, valp, cr));
2430}
2431