nfs_vnops.c revision 766:c521de78a32f
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 *
26 *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
27 *	All rights reserved.
28 */
29
30#pragma ident	"%Z%%M%	%I%	%E% SMI"
31
32#include <sys/param.h>
33#include <sys/types.h>
34#include <sys/systm.h>
35#include <sys/cred.h>
36#include <sys/time.h>
37#include <sys/vnode.h>
38#include <sys/vfs.h>
39#include <sys/file.h>
40#include <sys/filio.h>
41#include <sys/uio.h>
42#include <sys/buf.h>
43#include <sys/mman.h>
44#include <sys/pathname.h>
45#include <sys/dirent.h>
46#include <sys/debug.h>
47#include <sys/vmsystm.h>
48#include <sys/fcntl.h>
49#include <sys/flock.h>
50#include <sys/swap.h>
51#include <sys/errno.h>
52#include <sys/strsubr.h>
53#include <sys/sysmacros.h>
54#include <sys/kmem.h>
55#include <sys/cmn_err.h>
56#include <sys/pathconf.h>
57#include <sys/utsname.h>
58#include <sys/dnlc.h>
59#include <sys/acl.h>
60#include <sys/atomic.h>
61#include <sys/policy.h>
62#include <sys/sdt.h>
63
64#include <rpc/types.h>
65#include <rpc/auth.h>
66#include <rpc/clnt.h>
67
68#include <nfs/nfs.h>
69#include <nfs/nfs_clnt.h>
70#include <nfs/rnode.h>
71#include <nfs/nfs_acl.h>
72#include <nfs/lm.h>
73
74#include <vm/hat.h>
75#include <vm/as.h>
76#include <vm/page.h>
77#include <vm/pvn.h>
78#include <vm/seg.h>
79#include <vm/seg_map.h>
80#include <vm/seg_kpm.h>
81#include <vm/seg_vn.h>
82
83#include <fs/fs_subr.h>
84
85#include <sys/ddi.h>
86
87static int	nfs_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
88			cred_t *);
89static int	nfswrite(vnode_t *, caddr_t, uint_t, int, cred_t *);
90static int	nfsread(vnode_t *, caddr_t, uint_t, int, size_t *, cred_t *);
91static int	nfssetattr(vnode_t *, struct vattr *, int, cred_t *);
92static int	nfslookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
93static int	nfslookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
94static int	nfsrename(vnode_t *, char *, vnode_t *, char *, cred_t *);
95static int	nfsreaddir(vnode_t *, rddir_cache *, cred_t *);
96static int	nfs_bio(struct buf *, cred_t *);
97static int	nfs_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
98			page_t *[], size_t, struct seg *, caddr_t,
99			enum seg_rw, cred_t *);
100static void	nfs_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
101			cred_t *);
102static int	nfs_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
103			int, cred_t *);
104static int	nfs_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
105			int, cred_t *);
106static void	nfs_delmap_callback(struct as *, void *, uint_t);
107
108/*
109 * Error values used to pass information about certain special
110 * conditions which require special handling.
111 */
112#define	NFS_EOF			-98
113
114/*
115 * These are the vnode ops routines which implement the vnode interface to
116 * the networked file system.  These routines just take their parameters,
117 * make them look networkish by putting the right info into interface structs,
118 * and then call the appropriate remote routine(s) to do the work.
119 *
120 * Note on directory name lookup caching:  If we detect a stale fhandle,
121 * we purge the directory cache relative to that vnode.  This way, the
122 * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
123 * more details on rnode locking.
124 */
125
126static int	nfs_open(vnode_t **, int, cred_t *);
127static int	nfs_close(vnode_t *, int, int, offset_t, cred_t *);
128static int	nfs_read(vnode_t *, struct uio *, int, cred_t *,
129			caller_context_t *);
130static int	nfs_write(vnode_t *, struct uio *, int, cred_t *,
131			caller_context_t *);
132static int	nfs_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *);
133static int	nfs_getattr(vnode_t *, struct vattr *, int, cred_t *);
134static int	nfs_setattr(vnode_t *, struct vattr *, int, cred_t *,
135			caller_context_t *);
136static int	nfs_access(vnode_t *, int, int, cred_t *);
137static int	nfs_accessx(void *, int, cred_t *);
138static int	nfs_readlink(vnode_t *, struct uio *, cred_t *);
139static int	nfs_fsync(vnode_t *, int, cred_t *);
140static void	nfs_inactive(vnode_t *, cred_t *);
141static int	nfs_lookup(vnode_t *, char *, vnode_t **, struct pathname *,
142			int, vnode_t *, cred_t *);
143static int	nfs_create(vnode_t *, char *, struct vattr *, enum vcexcl,
144			int, vnode_t **, cred_t *, int);
145static int	nfs_remove(vnode_t *, char *, cred_t *);
146static int	nfs_link(vnode_t *, vnode_t *, char *, cred_t *);
147static int	nfs_rename(vnode_t *, char *, vnode_t *, char *, cred_t *);
148static int	nfs_mkdir(vnode_t *, char *, struct vattr *,
149			vnode_t **, cred_t *);
150static int	nfs_rmdir(vnode_t *, char *, vnode_t *, cred_t *);
151static int	nfs_symlink(vnode_t *, char *, struct vattr *, char *,
152			cred_t *);
153static int	nfs_readdir(vnode_t *, struct uio *, cred_t *, int *);
154static int	nfs_fid(vnode_t *, fid_t *);
155static int	nfs_rwlock(vnode_t *, int, caller_context_t *);
156static void	nfs_rwunlock(vnode_t *, int, caller_context_t *);
157static int	nfs_seek(vnode_t *, offset_t, offset_t *);
158static int	nfs_getpage(vnode_t *, offset_t, size_t, uint_t *,
159			page_t *[], size_t, struct seg *, caddr_t,
160			enum seg_rw, cred_t *);
161static int	nfs_putpage(vnode_t *, offset_t, size_t, int, cred_t *);
162static int	nfs_map(vnode_t *, offset_t, struct as *, caddr_t *,
163			size_t, uchar_t, uchar_t, uint_t, cred_t *);
164static int	nfs_addmap(vnode_t *, offset_t, struct as *, caddr_t,
165			size_t, uchar_t, uchar_t, uint_t, cred_t *);
166static int	nfs_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
167			struct flk_callback *, cred_t *);
168static int	nfs_space(vnode_t *, int, struct flock64 *, int, offset_t,
169			cred_t *, caller_context_t *);
170static int	nfs_realvp(vnode_t *, vnode_t **);
171static int	nfs_delmap(vnode_t *, offset_t, struct as *, caddr_t,
172			size_t, uint_t, uint_t, uint_t, cred_t *);
173static int	nfs_pathconf(vnode_t *, int, ulong_t *, cred_t *);
174static int	nfs_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
175			cred_t *);
176static int	nfs_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *);
177static int	nfs_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *);
178static int	nfs_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *);
179
180struct vnodeops *nfs_vnodeops;
181
182const fs_operation_def_t nfs_vnodeops_template[] = {
183	VOPNAME_OPEN, nfs_open,
184	VOPNAME_CLOSE, nfs_close,
185	VOPNAME_READ, nfs_read,
186	VOPNAME_WRITE, nfs_write,
187	VOPNAME_IOCTL, nfs_ioctl,
188	VOPNAME_GETATTR, nfs_getattr,
189	VOPNAME_SETATTR, nfs_setattr,
190	VOPNAME_ACCESS, nfs_access,
191	VOPNAME_LOOKUP, nfs_lookup,
192	VOPNAME_CREATE, nfs_create,
193	VOPNAME_REMOVE, nfs_remove,
194	VOPNAME_LINK, nfs_link,
195	VOPNAME_RENAME, nfs_rename,
196	VOPNAME_MKDIR, nfs_mkdir,
197	VOPNAME_RMDIR, nfs_rmdir,
198	VOPNAME_READDIR, nfs_readdir,
199	VOPNAME_SYMLINK, nfs_symlink,
200	VOPNAME_READLINK, nfs_readlink,
201	VOPNAME_FSYNC, nfs_fsync,
202	VOPNAME_INACTIVE, (fs_generic_func_p) nfs_inactive,
203	VOPNAME_FID, nfs_fid,
204	VOPNAME_RWLOCK, nfs_rwlock,
205	VOPNAME_RWUNLOCK, (fs_generic_func_p) nfs_rwunlock,
206	VOPNAME_SEEK, nfs_seek,
207	VOPNAME_FRLOCK, nfs_frlock,
208	VOPNAME_SPACE, nfs_space,
209	VOPNAME_REALVP, nfs_realvp,
210	VOPNAME_GETPAGE, nfs_getpage,
211	VOPNAME_PUTPAGE, nfs_putpage,
212	VOPNAME_MAP, (fs_generic_func_p) nfs_map,
213	VOPNAME_ADDMAP, (fs_generic_func_p) nfs_addmap,
214	VOPNAME_DELMAP, nfs_delmap,
215	VOPNAME_DUMP, nfs_dump,
216	VOPNAME_PATHCONF, nfs_pathconf,
217	VOPNAME_PAGEIO, nfs_pageio,
218	VOPNAME_SETSECATTR, nfs_setsecattr,
219	VOPNAME_GETSECATTR, nfs_getsecattr,
220	VOPNAME_SHRLOCK, nfs_shrlock,
221	NULL, NULL
222};
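
/*
 * A template like the one above is turned into a live vnodeops
 * structure at module load time via vn_make_ops(9F).  A minimal sketch
 * of that call (the real call site lives in the NFS client's module
 * initialization path, not in this file):
 *
 *	if (vn_make_ops("nfs", nfs_vnodeops_template, &nfs_vnodeops) != 0)
 *		cmn_err(CE_WARN, "nfs: bad vnode ops template");
 */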
223
224/*
225 * XXX:  This is referenced in modstubs.s
226 */
227struct vnodeops *
228nfs_getvnodeops(void)
229{
230	return (nfs_vnodeops);
231}
232
233/* ARGSUSED */
234static int
235nfs_open(vnode_t **vpp, int flag, cred_t *cr)
236{
237	int error;
238	struct vattr va;
239	rnode_t *rp;
240	vnode_t *vp;
241
242	vp = *vpp;
243	rp = VTOR(vp);
244	if (nfs_zone() != VTOMI(vp)->mi_zone)
245		return (EIO);
246	mutex_enter(&rp->r_statelock);
247	if (rp->r_cred == NULL) {
248		crhold(cr);
249		rp->r_cred = cr;
250	}
251	mutex_exit(&rp->r_statelock);
252
253	/*
254	 * If there is no cached data or if close-to-open
255	 * consistency checking is turned off, we can avoid
256	 * the over the wire getattr.  Otherwise, if the
257	 * file system is mounted readonly, then just verify
258	 * the caches are up to date using the normal mechanism.
259	 * Else, if the file is not mmap'd, then just mark
260	 * the attributes as timed out.  They will be refreshed
261	 * and the caches validated prior to being used.
262	 * Else, the file system is mounted writeable so
263	 * force an over the wire GETATTR in order to ensure
264	 * that all cached data is valid.
265	 */
266	if (vp->v_count > 1 ||
267	    ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
268	    !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
269		if (vn_is_readonly(vp))
270			error = nfs_validate_caches(vp, cr);
271		else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
272			PURGE_ATTRCACHE(vp);
273			error = 0;
274		} else {
275			va.va_mask = AT_ALL;
276			error = nfs_getattr_otw(vp, &va, cr);
277		}
278	} else
279		error = 0;
280
281	return (error);
282}
283
284static int
285nfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
286{
287	rnode_t *rp;
288	int error;
289	struct vattr va;
290
291	/*
292	 * zone_enter(2) prevents processes from changing zones with NFS files
293	 * open; if we happen to get here from the wrong zone we can't do
294	 * anything over the wire.
295	 */
296	if (VTOMI(vp)->mi_zone != nfs_zone()) {
297		/*
298		 * We could attempt to clean up locks, except we're sure
299		 * that the current process didn't acquire any locks on
300		 * the file: any attempt to lock a file belonging to another zone
301		 * will fail, and one can't lock an NFS file and then change
302		 * zones, as that fails too.
303		 *
304		 * Returning an error here is the sane thing to do.  A
305		 * subsequent call to VN_RELE() which translates to a
306		 * nfs_inactive() will clean up state: if the zone of the
307		 * vnode's origin is still alive and kicking, an async worker
308		 * thread will handle the request (from the correct zone), and
309		 * everything (minus the final nfs_getattr_otw() call) should
310		 * be OK. If the zone is going away nfs_async_inactive() will
311		 * throw away cached pages inline.
312		 */
313		return (EIO);
314	}
315
316	/*
317	 * If we are using local locking for this filesystem, then
318	 * release all of the SYSV style record locks.  Otherwise,
319	 * we are doing network locking and we need to release all
320	 * of the network locks.  All of the locks held by this
321	 * process on this file are released no matter what the
322	 * incoming reference count is.
323	 */
324	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
325		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
326		cleanshares(vp, ttoproc(curthread)->p_pid);
327	} else
328		nfs_lockrelease(vp, flag, offset, cr);
329
330	if (count > 1)
331		return (0);
332
333	/*
334	 * If the file has been `unlinked', then purge the
335	 * DNLC so that this vnode will get recycled quicker
336	 * and the .nfs* file on the server will get removed.
337	 */
338	rp = VTOR(vp);
339	if (rp->r_unldvp != NULL)
340		dnlc_purge_vp(vp);
341
342	/*
343	 * If the file was open for write and there are cached pages:
344	 * if the file system was mounted using the "no-close-
345	 *	to-open" semantics, then start an asynchronous flush
346	 *	of all of the pages in the file;
347	 * else the file system was mounted with the normal close-
348	 *	to-open semantics, so do a synchronous flush and
349	 *	commit of all of the dirty and uncommitted pages.
350	 *
351	 * The asynchronous flush of the pages in the "nocto" path
352	 * mostly just associates a cred pointer with the rnode so
353	 * writes which happen later will have a better chance of
354	 * working.  It also starts the data being written to the
355	 * server, but without unnecessarily delaying the application.
356	 */
357	if ((flag & FWRITE) && vn_has_cached_data(vp)) {
358		if ((VTOMI(vp)->mi_flags & MI_NOCTO)) {
359			error = nfs_putpage(vp, (offset_t)0, 0, B_ASYNC, cr);
360			if (error == EAGAIN)
361				error = 0;
362		} else
363			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
364		if (!error) {
365			mutex_enter(&rp->r_statelock);
366			error = rp->r_error;
367			rp->r_error = 0;
368			mutex_exit(&rp->r_statelock);
369		}
370	} else {
371		mutex_enter(&rp->r_statelock);
372		error = rp->r_error;
373		rp->r_error = 0;
374		mutex_exit(&rp->r_statelock);
375	}
376
377	/*
378	 * If RWRITEATTR is set, then issue an over the wire GETATTR to
379	 * refresh the attribute cache with a set of attributes which
380	 * weren't returned from a WRITE.  This will enable the close-
381	 * to-open processing to work.
382	 */
383	if (rp->r_flags & RWRITEATTR)
384		(void) nfs_getattr_otw(vp, &va, cr);
385
386	return (error);
387}
388
389/* ARGSUSED */
390static int
391nfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
392	caller_context_t *ct)
393{
394	rnode_t *rp;
395	u_offset_t off;
396	offset_t diff;
397	int on;
398	size_t n;
399	caddr_t base;
400	uint_t flags;
401	int error;
402	mntinfo_t *mi;
403
404	rp = VTOR(vp);
405	mi = VTOMI(vp);
406
407	if (nfs_zone() != mi->mi_zone)
408		return (EIO);
409
410	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
411
412	if (vp->v_type != VREG)
413		return (EISDIR);
414
415	if (uiop->uio_resid == 0)
416		return (0);
417
418	if (uiop->uio_loffset > MAXOFF32_T)
419		return (EFBIG);
420
421	if (uiop->uio_loffset < 0 ||
422	    uiop->uio_loffset + uiop->uio_resid > MAXOFF32_T)
423		return (EINVAL);
424
425	/*
426	 * Bypass VM if caching has been disabled (e.g., locking) or if
427	 * using client-side direct I/O and the file is not mmap'd and
428	 * there are no cached pages.
429	 */
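	/*
	 * RDIRECTIO is set per-file via the _FIODIRECTIO ioctl that
	 * nfs_ioctl() services below (reachable from directio(3C));
	 * MI_DIRECTIO covers the whole mount, presumably set by a
	 * mount option such as "forcedirectio".
	 */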
430	if ((vp->v_flag & VNOCACHE) ||
431	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
432	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
433		size_t bufsize;
434		size_t resid = 0;
435
436		/*
437		 * Let's try to do the read in as large a chunk as we can
438		 * (the filesystem's (NFS client's) bsize if possible/needed).
439		 * For V3 this is 32K; for V2 it is 8K.
440		 */
441		bufsize = MIN(uiop->uio_resid, VTOMI(vp)->mi_curread);
442		base = kmem_alloc(bufsize, KM_SLEEP);
443		do {
444			n = MIN(uiop->uio_resid, bufsize);
445			error = nfsread(vp, base, uiop->uio_offset, n,
446			    &resid, cr);
447			if (!error) {
448				n -= resid;
449				error = uiomove(base, n, UIO_READ, uiop);
450			}
451		} while (!error && uiop->uio_resid > 0 && n > 0);
452		kmem_free(base, bufsize);
453		return (error);
454	}
455
456	error = 0;
457
458	do {
459		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
460		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
461		n = MIN(MAXBSIZE - on, uiop->uio_resid);
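		/*
		 * For example, assuming MAXBSIZE is 8K: a request at
		 * loffset 0x2345 yields off = 0x2000 and on = 0x345,
		 * and n covers at most the rest of that 8K window, so
		 * each pass stays within a single segmap mapping.
		 */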
462
463		error = nfs_validate_caches(vp, cr);
464		if (error)
465			break;
466
467		mutex_enter(&rp->r_statelock);
468		diff = rp->r_size - uiop->uio_loffset;
469		mutex_exit(&rp->r_statelock);
470		if (diff <= 0)
471			break;
472		if (diff < n)
473			n = (size_t)diff;
474
475		base = segmap_getmapflt(segkmap, vp, off + on, n, 1, S_READ);
476
477		error = uiomove(base + on, n, UIO_READ, uiop);
478
479		if (!error) {
480			/*
481			 * If we read a whole block or read to EOF, we
482			 * won't need this buffer again soon.
483			 */
484			mutex_enter(&rp->r_statelock);
485			if (n + on == MAXBSIZE ||
486			    uiop->uio_loffset == rp->r_size)
487				flags = SM_DONTNEED;
488			else
489				flags = 0;
490			mutex_exit(&rp->r_statelock);
491			error = segmap_release(segkmap, base, flags);
492		} else
493			(void) segmap_release(segkmap, base, 0);
494	} while (!error && uiop->uio_resid > 0);
495
496	return (error);
497}
498
499/* ARGSUSED */
500static int
501nfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
502	caller_context_t *ct)
503{
504	rnode_t *rp;
505	u_offset_t off;
506	caddr_t base;
507	uint_t flags;
508	int remainder;
509	size_t n;
510	int on;
511	int error;
512	int resid;
513	offset_t offset;
514	rlim_t limit;
515	mntinfo_t *mi;
516
517	rp = VTOR(vp);
518
519	mi = VTOMI(vp);
520	if (nfs_zone() != mi->mi_zone)
521		return (EIO);
522	if (vp->v_type != VREG)
523		return (EISDIR);
524
525	if (uiop->uio_resid == 0)
526		return (0);
527
528	if (ioflag & FAPPEND) {
529		struct vattr va;
530
531		/*
532		 * Must serialize if appending.
533		 */
534		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
535			nfs_rw_exit(&rp->r_rwlock);
536			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
537			    INTR(vp)))
538				return (EINTR);
539		}
540
541		va.va_mask = AT_SIZE;
542		error = nfsgetattr(vp, &va, cr);
543		if (error)
544			return (error);
545		uiop->uio_loffset = va.va_size;
546	}
547
548	if (uiop->uio_loffset > MAXOFF32_T)
549		return (EFBIG);
550
551	offset = uiop->uio_loffset + uiop->uio_resid;
552
553	if (uiop->uio_loffset < 0 || offset > MAXOFF32_T)
554		return (EINVAL);
555
556	if (uiop->uio_llimit > (rlim64_t)MAXOFF32_T) {
557		limit = MAXOFF32_T;
558	} else {
559		limit = (rlim_t)uiop->uio_llimit;
560	}
561
562	/*
563	 * Check to make sure that the process will not exceed
564	 * its limit on file size.  It is okay to write up to
565	 * the limit, but not beyond.  Thus, the write which
566	 * reaches the limit will be short and the next write
567	 * will return an error.
568	 */
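	/*
	 * For example, with limit at 1MB, uio_loffset at 1MB - 100 and
	 * uio_resid of 300: offset is 1MB + 200, so remainder becomes
	 * 200 and uio_resid is trimmed to 100; this write stops exactly
	 * at the limit and the next one fails with EFBIG.
	 */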
569	remainder = 0;
570	if (offset > limit) {
571		remainder = offset - limit;
572		uiop->uio_resid = limit - uiop->uio_offset;
573		if (uiop->uio_resid <= 0) {
574			proc_t *p = ttoproc(curthread);
575
576			uiop->uio_resid += remainder;
577			mutex_enter(&p->p_lock);
578			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
579			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
580			mutex_exit(&p->p_lock);
581			return (EFBIG);
582		}
583	}
584
585	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
586		return (EINTR);
587
588	/*
589	 * Bypass VM if caching has been disabled (e.g., locking) or if
590	 * using client-side direct I/O and the file is not mmap'd and
591	 * there are no cached pages.
592	 */
593	if ((vp->v_flag & VNOCACHE) ||
594	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
595	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
596		size_t bufsize;
597		int count;
598		uint_t org_offset;
599
600nfs_fwrite:
601		if (rp->r_flags & RSTALE) {
602			resid = uiop->uio_resid;
603			offset = uiop->uio_loffset;
604			error = rp->r_error;
605			goto bottom;
606		}
607		bufsize = MIN(uiop->uio_resid, mi->mi_curwrite);
608		base = kmem_alloc(bufsize, KM_SLEEP);
609		do {
610			resid = uiop->uio_resid;
611			offset = uiop->uio_loffset;
612			count = MIN(uiop->uio_resid, bufsize);
613			org_offset = uiop->uio_offset;
614			error = uiomove(base, count, UIO_WRITE, uiop);
615			if (!error) {
616				error = nfswrite(vp, base, org_offset,
617				    count, cr);
618			}
619		} while (!error && uiop->uio_resid > 0);
620		kmem_free(base, bufsize);
621		goto bottom;
622	}
623
624	do {
625		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
626		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
627		n = MIN(MAXBSIZE - on, uiop->uio_resid);
628
629		resid = uiop->uio_resid;
630		offset = uiop->uio_loffset;
631
632		if (rp->r_flags & RSTALE) {
633			error = rp->r_error;
634			break;
635		}
636
637		/*
638		 * Don't create dirty pages faster than they
639		 * can be cleaned so that the system doesn't
640		 * get imbalanced.  If the async queue is
641		 * maxed out, then wait for it to drain before
642		 * creating more dirty pages.  Also, wait for
643		 * any threads doing pagewalks in the vop_getattr
644		 * entry points so that they don't block for
645		 * long periods.
646		 */
647		mutex_enter(&rp->r_statelock);
648		while ((mi->mi_max_threads != 0 &&
649		    rp->r_awcount > 2 * mi->mi_max_threads) ||
650		    rp->r_gcount > 0)
651			cv_wait(&rp->r_cv, &rp->r_statelock);
652		mutex_exit(&rp->r_statelock);
653
654		if (segmap_kpm) {
655			int pon = uiop->uio_loffset & PAGEOFFSET;
656			size_t pn = MIN(PAGESIZE - pon, uiop->uio_resid);
657			int pagecreate;
658
659			mutex_enter(&rp->r_statelock);
660			pagecreate = (pon == 0) && (pn == PAGESIZE ||
661				uiop->uio_loffset + pn >= rp->r_size);
662			mutex_exit(&rp->r_statelock);
663
664			base = segmap_getmapflt(segkmap, vp, off + on,
665						pn, !pagecreate, S_WRITE);
666
667			error = writerp(rp, base + pon, n, uiop, pagecreate);
668
669		} else {
670			base = segmap_getmapflt(segkmap, vp, off + on,
671						n, 0, S_READ);
672			error = writerp(rp, base + on, n, uiop, 0);
673		}
674
675		if (!error) {
676			if (mi->mi_flags & MI_NOAC)
677				flags = SM_WRITE;
678			else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
679				/*
680				 * Have written a whole block.
681				 * Start an asynchronous write
682				 * and mark the buffer to
683				 * indicate that it won't be
684				 * needed again soon.
685				 */
686				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
687			} else
688				flags = 0;
689			if ((ioflag & (FSYNC|FDSYNC)) ||
690			    (rp->r_flags & ROUTOFSPACE)) {
691				flags &= ~SM_ASYNC;
692				flags |= SM_WRITE;
693			}
694			error = segmap_release(segkmap, base, flags);
695		} else {
696			(void) segmap_release(segkmap, base, 0);
697			/*
698			 * In the event that we got an access error while
699			 * faulting in a page for a write-only file, just
700			 * force a write.
701			 */
702			if (error == EACCES)
703				goto nfs_fwrite;
704		}
705	} while (!error && uiop->uio_resid > 0);
706
707bottom:
708	if (error) {
709		uiop->uio_resid = resid + remainder;
710		uiop->uio_loffset = offset;
711	} else
712		uiop->uio_resid += remainder;
713
714	nfs_rw_exit(&rp->r_lkserlock);
715
716	return (error);
717}
718
719/*
720 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
721 */
722static int
723nfs_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
724	int flags, cred_t *cr)
725{
726	struct buf *bp;
727	int error;
728
729	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
730	bp = pageio_setup(pp, len, vp, flags);
731	ASSERT(bp != NULL);
732
733	/*
734	 * pageio_setup should have set b_addr to 0.  This
735	 * is correct since we want to do I/O on a page
736	 * boundary.  bp_mapin will use this addr to calculate
737	 * an offset, and then set b_addr to the kernel virtual
738	 * address it allocated for us.
739	 */
740	ASSERT(bp->b_un.b_addr == 0);
741
742	bp->b_edev = 0;
743	bp->b_dev = 0;
744	bp->b_lblkno = lbtodb(off);
745	bp->b_file = vp;
746	bp->b_offset = (offset_t)off;
747	bp_mapin(bp);
748
749	error = nfs_bio(bp, cr);
750
751	bp_mapout(bp);
752	pageio_done(bp);
753
754	return (error);
755}
756
757/*
758 * Write to file.  Writes to remote server in largest size
759 * chunks that the server can handle.  Write is synchronous.
760 */
761static int
762nfswrite(vnode_t *vp, caddr_t base, uint_t offset, int count, cred_t *cr)
763{
764	rnode_t *rp;
765	mntinfo_t *mi;
766	struct nfswriteargs wa;
767	struct nfsattrstat ns;
768	int error;
769	int tsize;
770	int douprintf;
771
772	douprintf = 1;
773
774	rp = VTOR(vp);
775	mi = VTOMI(vp);
776
777	ASSERT(nfs_zone() == mi->mi_zone);
778
779	wa.wa_args = &wa.wa_args_buf;
780	wa.wa_fhandle = *VTOFH(vp);
781
782	do {
783		tsize = MIN(mi->mi_curwrite, count);
784		wa.wa_data = base;
785		wa.wa_begoff = offset;
786		wa.wa_totcount = tsize;
787		wa.wa_count = tsize;
788		wa.wa_offset = offset;
789
790		if (mi->mi_io_kstats) {
791			mutex_enter(&mi->mi_lock);
792			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
793			mutex_exit(&mi->mi_lock);
794		}
795		wa.wa_mblk = NULL;
796		do {
797			error = rfs2call(mi, RFS_WRITE,
798			    xdr_writeargs, (caddr_t)&wa,
799			    xdr_attrstat, (caddr_t)&ns, cr,
800			    &douprintf, &ns.ns_status, 0, NULL);
801		} while (error == ENFS_TRYAGAIN);
802		if (mi->mi_io_kstats) {
803			mutex_enter(&mi->mi_lock);
804			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
805			mutex_exit(&mi->mi_lock);
806		}
807
808		if (!error) {
809			error = geterrno(ns.ns_status);
810			/*
811			 * Can't check for stale fhandle and purge caches
812			 * here because pages are held by nfs_getpage.
813			 * Just mark the attribute cache as timed out
814			 * and set RWRITEATTR to indicate that the file
815			 * was modified with a WRITE operation.
816			 */
817			if (!error) {
818				count -= tsize;
819				base += tsize;
820				offset += tsize;
821				if (mi->mi_io_kstats) {
822					mutex_enter(&mi->mi_lock);
823					KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
824					KSTAT_IO_PTR(mi->mi_io_kstats)->
825					    nwritten += tsize;
826					mutex_exit(&mi->mi_lock);
827				}
828				lwp_stat_update(LWP_STAT_OUBLK, 1);
829				mutex_enter(&rp->r_statelock);
830				PURGE_ATTRCACHE_LOCKED(rp);
831				rp->r_flags |= RWRITEATTR;
832				mutex_exit(&rp->r_statelock);
833			}
834		}
835	} while (!error && count);
836
837	return (error);
838}
839
840/*
841 * Read from a file.  Reads data in largest chunks our interface can handle.
842 */
843static int
844nfsread(vnode_t *vp, caddr_t base, uint_t offset, int count, size_t *residp,
845	cred_t *cr)
846{
847	mntinfo_t *mi;
848	struct nfsreadargs ra;
849	struct nfsrdresult rr;
850	int tsize;
851	int error;
852	int douprintf;
853	failinfo_t fi;
854	rnode_t *rp;
855	struct vattr va;
856	hrtime_t t;
857
858	rp = VTOR(vp);
859	mi = VTOMI(vp);
860
861	ASSERT(nfs_zone() == mi->mi_zone);
862
863	douprintf = 1;
864
865	ra.ra_fhandle = *VTOFH(vp);
866
867	fi.vp = vp;
868	fi.fhp = (caddr_t)&ra.ra_fhandle;
869	fi.copyproc = nfscopyfh;
870	fi.lookupproc = nfslookup;
871	fi.xattrdirproc = acl_getxattrdir2;
872
873	do {
874		if (mi->mi_io_kstats) {
875			mutex_enter(&mi->mi_lock);
876			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
877			mutex_exit(&mi->mi_lock);
878		}
879
880		do {
881			tsize = MIN(mi->mi_curread, count);
882			rr.rr_data = base;
883			ra.ra_offset = offset;
884			ra.ra_totcount = tsize;
885			ra.ra_count = tsize;
886			t = gethrtime();
887			error = rfs2call(mi, RFS_READ,
888			    xdr_readargs, (caddr_t)&ra,
889			    xdr_rdresult, (caddr_t)&rr, cr,
890			    &douprintf, &rr.rr_status, 0, &fi);
891		} while (error == ENFS_TRYAGAIN);
892
893		if (mi->mi_io_kstats) {
894			mutex_enter(&mi->mi_lock);
895			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
896			mutex_exit(&mi->mi_lock);
897		}
898
899		if (!error) {
900			error = geterrno(rr.rr_status);
901			if (!error) {
902				count -= rr.rr_count;
903				base += rr.rr_count;
904				offset += rr.rr_count;
905				if (mi->mi_io_kstats) {
906					mutex_enter(&mi->mi_lock);
907					KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
908					KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
909					    rr.rr_count;
910					mutex_exit(&mi->mi_lock);
911				}
912				lwp_stat_update(LWP_STAT_INBLK, 1);
913			}
914		}
915	} while (!error && count && rr.rr_count == tsize);
916
917	*residp = count;
918
919	if (!error) {
920		/*
921		 * Since no error occurred, we have the current
922		 * attributes and we need to do a cache check and then
923		 * potentially update the cached attributes.  We can't
924		 * use the normal attribute check and cache mechanisms
925		 * because they might cause a cache flush which would
926		 * deadlock.  Instead, we just check the cache to see
927		 * if the attributes have changed.  If they have, then we
928		 * just mark the attributes as out of date.  The next
929		 * time that the attributes are checked, they will be
930		 * out of date, new attributes will be fetched, and
931		 * the page cache will be flushed.  If the attributes
932		 * weren't changed, then we just update the cached
933		 * attributes with these attributes.
934		 */
935		/*
936		 * If NFS_ACL is supported on the server, then the
937		 * attributes returned by server may have minimal
938		 * permissions sometimes denying access to users having
939		 * proper access.  To get the proper attributes, mark
940		 * the attributes as expired so that they will be
941		 * refetched via the NFS_ACL GETATTR2 procedure.
942		 */
943		error = nattr_to_vattr(vp, &rr.rr_attr, &va);
944		mutex_enter(&rp->r_statelock);
945		if (error || !CACHE_VALID(rp, va.va_mtime, va.va_size) ||
946		    (mi->mi_flags & MI_ACL)) {
947			mutex_exit(&rp->r_statelock);
948			PURGE_ATTRCACHE(vp);
949		} else {
950			if (rp->r_mtime <= t) {
951				nfs_attrcache_va(vp, &va);
952			}
953			mutex_exit(&rp->r_statelock);
954		}
955	}
956
957	return (error);
958}
959
960/* ARGSUSED */
961static int
962nfs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
963{
964
965	if (nfs_zone() != VTOMI(vp)->mi_zone)
966		return (EIO);
967	switch (cmd) {
968		case _FIODIRECTIO:
969			return (nfs_directio(vp, (int)arg, cr));
970		default:
971			return (ENOTTY);
972	}
973}
974
975static int
976nfs_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
977{
978	int error;
979	rnode_t *rp;
980
981	if (nfs_zone() != VTOMI(vp)->mi_zone)
982		return (EIO);
983	/*
984	 * If it has been specified that the return value will
985	 * just be used as a hint, and we are only being asked
986	 * for size, fsid or rdevid, then return the client's
987	 * notion of these values without checking to make sure
988	 * that the attribute cache is up to date.
989	 * The whole point is to avoid an over the wire GETATTR
990	 * call.
991	 */
992	rp = VTOR(vp);
993	if (flags & ATTR_HINT) {
994		if (vap->va_mask ==
995		    (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
996			mutex_enter(&rp->r_statelock);
997			if (vap->va_mask & AT_SIZE)
998				vap->va_size = rp->r_size;
999			if (vap->va_mask & AT_FSID)
1000				vap->va_fsid = rp->r_attr.va_fsid;
1001			if (vap->va_mask & AT_RDEV)
1002				vap->va_rdev = rp->r_attr.va_rdev;
1002				vap->va_rdev = rp->r_attr.va_rdev;
1003			mutex_exit(&rp->r_statelock);
1004			return (0);
1005		}
1006	}
1007
1008	/*
1009	 * Only need to flush pages if asking for the mtime
1010	 * and if there any dirty pages or any outstanding
1011	 * asynchronous (write) requests for this file.
1012	 */
1013	if (vap->va_mask & AT_MTIME) {
1014		if (vn_has_cached_data(vp) &&
1015		    ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
1016			mutex_enter(&rp->r_statelock);
1017			rp->r_gcount++;
1018			mutex_exit(&rp->r_statelock);
1019			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
1020			mutex_enter(&rp->r_statelock);
1021			if (error && (error == ENOSPC || error == EDQUOT)) {
1022				if (!rp->r_error)
1023					rp->r_error = error;
1024			}
1025			if (--rp->r_gcount == 0)
1026				cv_broadcast(&rp->r_cv);
1027			mutex_exit(&rp->r_statelock);
1028		}
1029	}
1030
1031	return (nfsgetattr(vp, vap, cr));
1032}
1033
1034/*ARGSUSED4*/
1035static int
1036nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1037		caller_context_t *ct)
1038{
1039	int error;
1040	uint_t mask;
1041	struct vattr va;
1042
1043	mask = vap->va_mask;
1044
1045	if (mask & AT_NOSET)
1046		return (EINVAL);
1047
1048	if ((mask & AT_SIZE) &&
1049	    vap->va_type == VREG &&
1050	    vap->va_size > MAXOFF32_T)
1051		return (EFBIG);
1052
1053	if (nfs_zone() != VTOMI(vp)->mi_zone)
1054		return (EIO);
1055
1056	va.va_mask = AT_UID | AT_MODE;
1057
1058	error = nfsgetattr(vp, &va, cr);
1059	if (error)
1060		return (error);
1061
1062	error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs_accessx,
1063			vp);
1064
1065	if (error)
1066		return (error);
1067
1068	return (nfssetattr(vp, vap, flags, cr));
1069}
1070
1071static int
1072nfssetattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
1073{
1074	int error;
1075	uint_t mask;
1076	struct nfssaargs args;
1077	struct nfsattrstat ns;
1078	int douprintf;
1079	rnode_t *rp;
1080	struct vattr va;
1081	mode_t omode;
1082	mntinfo_t *mi;
1083	vsecattr_t *vsp;
1084	hrtime_t t;
1085
1086	mask = vap->va_mask;
1087
1088	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
1089
1090	rp = VTOR(vp);
1091
1092	/*
1093	 * Only need to flush pages if there are any pages and
1094	 * if the file is marked as dirty in some fashion.  The
1095	 * file must be flushed so that we can accurately
1096	 * determine the size of the file and the cached data
1097	 * after the SETATTR returns.  A file is considered to
1098	 * be dirty if it is either marked with RDIRTY, has
1099	 * outstanding i/o's active, or is mmap'd.  In this
1100	 * last case, we can't tell whether there are dirty
1101	 * pages, so we flush just to be sure.
1102	 */
1103	if (vn_has_cached_data(vp) &&
1104	    ((rp->r_flags & RDIRTY) ||
1105	    rp->r_count > 0 ||
1106	    rp->r_mapcnt > 0)) {
1107		ASSERT(vp->v_type != VCHR);
1108		error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
1109		if (error && (error == ENOSPC || error == EDQUOT)) {
1110			mutex_enter(&rp->r_statelock);
1111			if (!rp->r_error)
1112				rp->r_error = error;
1113			mutex_exit(&rp->r_statelock);
1114		}
1115	}
1116
1117	/*
1118	 * If the system call was utime(2) or utimes(2) and the
1119	 * application did not specify the times, then set the
1120	 * mtime nanosecond field to 1 billion.  This will get
1121	 * translated from 1 billion nanoseconds to 1 million
1122	 * microseconds in the over the wire request.  The
1123	 * server will use 1 million in the microsecond field
1124	 * to tell whether both the mtime and atime should be
1125	 * set to the server's current time.
1126	 *
1127	 * This is an overload of the protocol and should be
1128	 * documented in the NFS Version 2 protocol specification.
1129	 */
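	/*
	 * For example, a tv_nsec of 1000000000 becomes a tv_usec of
	 * 1000000 in the sattr built by vattr_to_sattr() below; no
	 * legal tv_nsec can produce that value, since 999999999
	 * nanoseconds is only 999999 microseconds.
	 */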
1130	if ((mask & AT_MTIME) && !(flags & ATTR_UTIME)) {
1131		vap->va_mtime.tv_nsec = 1000000000;
1132		if (NFS_TIME_T_OK(vap->va_mtime.tv_sec) &&
1133		    NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
1134			error = vattr_to_sattr(vap, &args.saa_sa);
1135		} else {
1136			/*
1137			 * Use server times. vap time values will not be used.
1138			 * To ensure no time overflow, make sure vap has
1139			 * valid values, but retain the original values.
1140			 */
1141			timestruc_t	mtime = vap->va_mtime;
1142			timestruc_t	atime = vap->va_atime;
1143			time_t		now;
1144
1145			now = gethrestime_sec();
1146			if (NFS_TIME_T_OK(now)) {
1147				/* Just in case server does not know of this */
1148				vap->va_mtime.tv_sec = now;
1149				vap->va_atime.tv_sec = now;
1150			} else {
1151				vap->va_mtime.tv_sec = 0;
1152				vap->va_atime.tv_sec = 0;
1153			}
1154			error = vattr_to_sattr(vap, &args.saa_sa);
1155			/* set vap times back on */
1156			vap->va_mtime = mtime;
1157			vap->va_atime = atime;
1158		}
1159	} else {
1160		/* Either do not set times or use the client specified times */
1161		error = vattr_to_sattr(vap, &args.saa_sa);
1162	}
1163	if (error) {
1164		/* req time field(s) overflow - return immediately */
1165		return (error);
1166	}
1167	args.saa_fh = *VTOFH(vp);
1168
1169	va.va_mask = AT_MODE;
1170	error = nfsgetattr(vp, &va, cr);
1171	if (error)
1172		return (error);
1173	omode = va.va_mode;
1174
1175	mi = VTOMI(vp);
1176
1177	douprintf = 1;
1178
1179	t = gethrtime();
1180
1181	error = rfs2call(mi, RFS_SETATTR,
1182	    xdr_saargs, (caddr_t)&args,
1183	    xdr_attrstat, (caddr_t)&ns, cr,
1184	    &douprintf, &ns.ns_status, 0, NULL);
1185
1186	/*
1187	 * Purge the access cache and ACL cache if changing either the
1188	 * owner of the file, the group owner, or the mode.  These may
1189	 * change the access permissions of the file, so purge old
1190	 * information and start over again.
1191	 */
1192	if ((mask & (AT_UID | AT_GID | AT_MODE)) && (mi->mi_flags & MI_ACL)) {
1193		(void) nfs_access_purge_rp(rp);
1194		if (rp->r_secattr != NULL) {
1195			mutex_enter(&rp->r_statelock);
1196			vsp = rp->r_secattr;
1197			rp->r_secattr = NULL;
1198			mutex_exit(&rp->r_statelock);
1199			if (vsp != NULL)
1200				nfs_acl_free(vsp);
1201		}
1202	}
1203
1204	if (!error) {
1205		error = geterrno(ns.ns_status);
1206		if (!error) {
1207			/*
1208			 * If changing the size of the file, invalidate
1209			 * any local cached data which is no longer part
1210			 * of the file.  We also possibly invalidate the
1211			 * last page in the file.  We could use
1212			 * pvn_vpzero(), but this would mark the page as
1213			 * modified and require it to be written back to
1214			 * the server for no particularly good reason.
1215			 * This way, if we access it, then we bring it
1216			 * back in.  A read should be cheaper than a
1217			 * write.
1218			 */
1219			if (mask & AT_SIZE) {
1220				nfs_invalidate_pages(vp,
1221				    (vap->va_size & PAGEMASK), cr);
1222			}
1223			(void) nfs_cache_fattr(vp, &ns.ns_attr, &va, t, cr);
1224			/*
1225			 * If NFS_ACL is supported on the server, then the
1226			 * attributes returned by server may have minimal
1227			 * permissions sometimes denying access to users having
1228			 * proper access.  To get the proper attributes, mark
1229			 * the attributes as expired so that they will be
1230			 * regotten via the NFS_ACL GETATTR2 procedure.
1231			 * refetched via the NFS_ACL GETATTR2 procedure.
1232			if (mi->mi_flags & MI_ACL) {
1233				PURGE_ATTRCACHE(vp);
1234			}
1235			/*
1236			 * This next check attempts to deal with NFS
1237			 * servers which can not handle increasing
1238			 * the size of the file via setattr.  Most
1239			 * of these servers do not return an error,
1240			 * but do not change the size of the file.
1241			 * Hence, this check and then attempt to set
1242			 * the file size by writing 1 byte at the
1243			 * offset of the end of the file that we need.
1244			 */
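			/*
			 * Note that the zero byte below lands at
			 * offset va_size - 1, the last byte of the
			 * desired size; an extending write zero-fills
			 * the gap anyway, so no user data is
			 * disturbed.
			 */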
1245			if ((mask & AT_SIZE) &&
1246			    ns.ns_attr.na_size < (uint32_t)vap->va_size) {
1247				char zb = '\0';
1248
1249				error = nfswrite(vp, &zb,
1250				    vap->va_size - sizeof (zb),
1251				    sizeof (zb), cr);
1252			}
1253			/*
1254			 * Some servers will change the mode to clear the setuid
1255			 * and setgid bits when changing the uid or gid.  The
1256			 * client needs to compensate appropriately.
1257			 */
1258			if (mask & (AT_UID | AT_GID)) {
1259				int terror;
1260
1261				va.va_mask = AT_MODE;
1262				terror = nfsgetattr(vp, &va, cr);
1263				if (!terror &&
1264				    (((mask & AT_MODE) &&
1265				    va.va_mode != vap->va_mode) ||
1266				    (!(mask & AT_MODE) &&
1267				    va.va_mode != omode))) {
1268					va.va_mask = AT_MODE;
1269					if (mask & AT_MODE)
1270						va.va_mode = vap->va_mode;
1271					else
1272						va.va_mode = omode;
1273					(void) nfssetattr(vp, &va, 0, cr);
1274				}
1275			}
1276		} else {
1277			PURGE_ATTRCACHE(vp);
1278			PURGE_STALE_FH(error, vp, cr);
1279		}
1280	} else {
1281		PURGE_ATTRCACHE(vp);
1282	}
1283
1284	return (error);
1285}
1286
1287static int
1288nfs_accessx(void *vp, int mode, cred_t *cr)
1289{
1290	ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
1291	return (nfs_access(vp, mode, 0, cr));
1292}
1293
1294static int
1295nfs_access(vnode_t *vp, int mode, int flags, cred_t *cr)
1296{
1297	struct vattr va;
1298	int error;
1299	mntinfo_t *mi;
1300	int shift = 0;
1301
1302	mi = VTOMI(vp);
1303
1304	if (nfs_zone() != mi->mi_zone)
1305		return (EIO);
1306	if (mi->mi_flags & MI_ACL) {
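		/*
		 * acl_access2() can clear MI_ACL if the server turns
		 * out not to support the NFS_ACL protocol; in that
		 * case the recheck below deliberately falls through
		 * to the plain mode-bit access check.
		 */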
1307		error = acl_access2(vp, mode, flags, cr);
1308		if (mi->mi_flags & MI_ACL)
1309			return (error);
1310	}
1311
1312	va.va_mask = AT_MODE | AT_UID | AT_GID;
1313	error = nfsgetattr(vp, &va, cr);
1314	if (error)
1315		return (error);
1316
1317	/*
1318	 * Disallow write attempts on read-only
1319	 * file systems, unless the file is a
1320	 * device node.
1321	 */
1322	if ((mode & VWRITE) && vn_is_readonly(vp) && !IS_DEVVP(vp))
1323		return (EROFS);
1324
1325	/*
1326	 * Disallow attempts to access mandatory lock files.
1327	 */
1328	if ((mode & (VWRITE | VREAD | VEXEC)) &&
1329	    MANDLOCK(vp, va.va_mode))
1330		return (EACCES);
1331
1332	/*
1333	 * Access check is based on only
1334	 * one of owner, group, public.
1335	 * If not owner, then check group.
1336	 * If not a member of the group,
1337	 * then check public access.
1338	 */
1339	if (crgetuid(cr) != va.va_uid) {
1340		shift += 3;
1341		if (!groupmember(va.va_gid, cr))
1342			shift += 3;
1343	}
1345	mode &= ~(va.va_mode << shift);
1346	if (mode == 0)
1347		return (0);
1348
1349	return (secpolicy_vnode_access(cr, vp, va.va_uid, mode));
1350}
1351
1352static int nfs_do_symlink_cache = 1;
1353
1354static int
1355nfs_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr)
1356{
1357	int error;
1358	struct nfsrdlnres rl;
1359	rnode_t *rp;
1360	int douprintf;
1361	failinfo_t fi;
1362
1363	/*
1364	 * We want to be consistent with UFS semantics so we will return
1365	 * EINVAL instead of ENXIO. This violates the XNFS spec and
1366 * EINVAL instead of ENXIO.  This violates the XNFS spec and
1367 * RFC 1094, which are wrong anyway.  BUGID 1138002.
1368	if (vp->v_type != VLNK)
1369		return (EINVAL);
1370
1371	if (nfs_zone() != VTOMI(vp)->mi_zone)
1372		return (EIO);
1373
1374	rp = VTOR(vp);
1375	if (nfs_do_symlink_cache && rp->r_symlink.contents != NULL) {
1376		error = nfs_validate_caches(vp, cr);
1377		if (error)
1378			return (error);
1379		mutex_enter(&rp->r_statelock);
1380		if (rp->r_symlink.contents != NULL) {
1381			error = uiomove(rp->r_symlink.contents,
1382			    rp->r_symlink.len, UIO_READ, uiop);
1383			mutex_exit(&rp->r_statelock);
1384			return (error);
1385		}
1386		mutex_exit(&rp->r_statelock);
1387	}
1388
1390	rl.rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
1391
1392	fi.vp = vp;
1393	fi.fhp = NULL;		/* no need to update, filehandle not copied */
1394	fi.copyproc = nfscopyfh;
1395	fi.lookupproc = nfslookup;
1396	fi.xattrdirproc = acl_getxattrdir2;
1397
1398	douprintf = 1;
1399
1400	error = rfs2call(VTOMI(vp), RFS_READLINK,
1401	    xdr_fhandle, (caddr_t)VTOFH(vp),
1402	    xdr_rdlnres, (caddr_t)&rl, cr,
1403	    &douprintf, &rl.rl_status, 0, &fi);
1404
1405	if (error) {
1407		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1408		return (error);
1409	}
1410
1411	error = geterrno(rl.rl_status);
1412	if (!error) {
1413		error = uiomove(rl.rl_data, (int)rl.rl_count, UIO_READ, uiop);
1414		if (nfs_do_symlink_cache && rp->r_symlink.contents == NULL) {
1415			mutex_enter(&rp->r_statelock);
1416			if (rp->r_symlink.contents == NULL) {
1417				rp->r_symlink.contents = rl.rl_data;
1418				rp->r_symlink.len = (int)rl.rl_count;
1419				rp->r_symlink.size = NFS_MAXPATHLEN;
1420				mutex_exit(&rp->r_statelock);
1421			} else {
1422				mutex_exit(&rp->r_statelock);
1423
1424				kmem_free((void *)rl.rl_data,
1425				    NFS_MAXPATHLEN);
1426			}
1427		} else {
1429			kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1430		}
1431	} else {
1432		PURGE_STALE_FH(error, vp, cr);
1433
1434		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1435	}
1436
1437	/*
1438	 * Conform to UFS semantics (see comment above)
1439	 */
1440	return (error == ENXIO ? EINVAL : error);
1441}
1442
1443/*
1444 * Flush local dirty pages to stable storage on the server.
1445 *
1446 * If FNODSYNC is specified, then there is nothing to do because
1447 * metadata changes are not cached on the client before being
1448 * sent to the server.
1449 */
1450static int
1451nfs_fsync(vnode_t *vp, int syncflag, cred_t *cr)
1452{
1453	int error;
1454
1455	if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
1456		return (0);
1457
1458	if (nfs_zone() != VTOMI(vp)->mi_zone)
1459		return (EIO);
1460
1461	error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
1462	if (!error)
1463		error = VTOR(vp)->r_error;
1464	return (error);
1465}
1466
1467
1468/*
1469 * Weirdness: if the file was removed or the target of a rename
1470 * operation while it was open, it got renamed instead.  Here we
1471 * remove the renamed file.
1472 */
1473static void
1474nfs_inactive(vnode_t *vp, cred_t *cr)
1475{
1476	rnode_t *rp;
1477
1478	ASSERT(vp != DNLC_NO_VNODE);
1479
1480	/*
1481	 * If this is coming from the wrong zone, we let someone in the right
1482	 * zone take care of it asynchronously.  We can get here due to
1483	 * VN_RELE() being called from pageout() or fsflush().  This call may
1484	 * potentially turn into an expensive no-op if, for instance, v_count
1485	 * gets incremented in the meantime, but it's still correct.
1486	 */
1487	if (nfs_zone() != VTOMI(vp)->mi_zone) {
1488		nfs_async_inactive(vp, cr, nfs_inactive);
1489		return;
1490	}
1491
1492	rp = VTOR(vp);
1493redo:
1494	if (rp->r_unldvp != NULL) {
1495		/*
1496		 * Save the vnode pointer for the directory where the
1497		 * unlinked-open file got renamed, then set it to NULL
1498		 * to prevent another thread from getting here before
1499		 * we're done with the remove.  While we have the
1500		 * statelock, make local copies of the pertinent rnode
1501		 * fields.  If we weren't to do this in an atomic way, the
1502		 * fields.  If we weren't to do this in an atomic way,
1503		 * to each other due to a race condition between this
1504		 * code and nfs_remove().  See bug report 1034328.
1505		 */
1506		mutex_enter(&rp->r_statelock);
1507		if (rp->r_unldvp != NULL) {
1508			vnode_t *unldvp;
1509			char *unlname;
1510			cred_t *unlcred;
1511			struct nfsdiropargs da;
1512			enum nfsstat status;
1513			int douprintf;
1514			int error;
1515
1516			unldvp = rp->r_unldvp;
1517			rp->r_unldvp = NULL;
1518			unlname = rp->r_unlname;
1519			rp->r_unlname = NULL;
1520			unlcred = rp->r_unlcred;
1521			rp->r_unlcred = NULL;
1522			mutex_exit(&rp->r_statelock);
1523
1524			/*
1525			 * If there are any dirty pages left, then flush
1526			 * them.  This is unfortunate because they just
1527			 * may get thrown away during the remove operation,
1528			 * but we have to do this for correctness.
1529			 */
1530			if (vn_has_cached_data(vp) &&
1531			    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
1532				ASSERT(vp->v_type != VCHR);
1533				error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
1534				if (error) {
1535					mutex_enter(&rp->r_statelock);
1536					if (!rp->r_error)
1537						rp->r_error = error;
1538					mutex_exit(&rp->r_statelock);
1539				}
1540			}
1541
1542			/*
1543			 * Do the remove operation on the renamed file
1544			 */
1545			setdiropargs(&da, unlname, unldvp);
1546
1547			douprintf = 1;
1548
1549			(void) rfs2call(VTOMI(unldvp), RFS_REMOVE,
1550			    xdr_diropargs, (caddr_t)&da,
1551			    xdr_enum, (caddr_t)&status, unlcred,
1552			    &douprintf, &status, 0, NULL);
1553
1554			if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
1555				nfs_purge_rddir_cache(unldvp);
1556			PURGE_ATTRCACHE(unldvp);
1557
1558			/*
1559			 * Release stuff held for the remove
1560			 */
1561			VN_RELE(unldvp);
1562			kmem_free(unlname, MAXNAMELEN);
1563			crfree(unlcred);
1564			goto redo;
1565		}
1566		mutex_exit(&rp->r_statelock);
1567	}
1568
1569	rp_addfree(rp, cr);
1570}
1571
1572/*
1573 * Remote file system operations having to do with directory manipulation.
1574 */
1575
1576static int
1577nfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1578	int flags, vnode_t *rdir, cred_t *cr)
1579{
1580	int error;
1581	vnode_t *vp;
1582	vnode_t *avp = NULL;
1583	rnode_t *drp;
1584
1585	if (nfs_zone() != VTOMI(dvp)->mi_zone)
1586		return (EPERM);
1587
1588	drp = VTOR(dvp);
1589
1590	/*
1591	 * Are we looking up extended attributes?  If so, "dvp" is
1592	 * the file or directory for which we want attributes, and
1593	 * we need a lookup of the hidden attribute directory
1594	 * before we lookup the rest of the path.
1595	 */
1596	if (flags & LOOKUP_XATTR) {
1597		bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
1598		mntinfo_t *mi;
1599
1600		mi = VTOMI(dvp);
1601		if (!(mi->mi_flags & MI_EXTATTR))
1602			return (EINVAL);
1603
1604		if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
1605			return (EINTR);
1606
1607		(void) nfslookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
1608		if (avp == NULL)
1609			error = acl_getxattrdir2(dvp, &avp, cflag, cr, 0);
1610		else
1611			error = 0;
1612
1613		nfs_rw_exit(&drp->r_rwlock);
1614
1615		if (error) {
1616			if (mi->mi_flags & MI_EXTATTR)
1617				return (error);
1618			return (EINVAL);
1619		}
1620		dvp = avp;
1621		drp = VTOR(dvp);
1622	}
1623
1624	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
1625		error = EINTR;
1626		goto out;
1627	}
1628
1629	error = nfslookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);
1630
1631	nfs_rw_exit(&drp->r_rwlock);
1632
1633	/*
1634	 * If vnode is a device, create special vnode.
1635	 */
1636	if (!error && IS_DEVVP(*vpp)) {
1637		vp = *vpp;
1638		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1639		VN_RELE(vp);
1640	}
1641
1642out:
1643	if (avp != NULL)
1644		VN_RELE(avp);
1645
1646	return (error);
1647}
1648
1649static int nfs_lookup_neg_cache = 1;
1650
1651#ifdef DEBUG
1652static int nfs_lookup_dnlc_hits = 0;
1653static int nfs_lookup_dnlc_misses = 0;
1654static int nfs_lookup_dnlc_neg_hits = 0;
1655static int nfs_lookup_dnlc_disappears = 0;
1656static int nfs_lookup_dnlc_lookups = 0;
1657#endif
1658
1659/* ARGSUSED */
1660int
1661nfslookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1662	int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
1663{
1664	int error;
1665
1666	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1667
1668	/*
1669	 * If lookup is for "", just return dvp.  Don't need
1670	 * to send it over the wire, look it up in the dnlc,
1671	 * or perform any access checks.
1672	 */
1673	if (*nm == '\0') {
1674		VN_HOLD(dvp);
1675		*vpp = dvp;
1676		return (0);
1677	}
1678
1679	/*
1680	 * Can't do lookups in non-directories.
1681	 */
1682	if (dvp->v_type != VDIR)
1683		return (ENOTDIR);
1684
1685	/*
1686	 * If we're called with RFSCALL_SOFT, it's important that
1687	 * the only rfscall is one we make directly; if we permit
1688	 * an access call because we're looking up "." or validating
1689	 * a dnlc hit, we'll deadlock because that rfscall will not
1690 * have the RFSCALL_SOFT flag set.
1691	 */
1692	if (rfscall_flags & RFSCALL_SOFT)
1693		goto callit;
1694
1695	/*
1696	 * If lookup is for ".", just return dvp.  Don't need
1697	 * to send it over the wire or look it up in the dnlc,
1698	 * just need to check access.
1699	 */
1700	if (strcmp(nm, ".") == 0) {
1701		error = nfs_access(dvp, VEXEC, 0, cr);
1702		if (error)
1703			return (error);
1704		VN_HOLD(dvp);
1705		*vpp = dvp;
1706		return (0);
1707	}
1708
1709	/*
1710	 * Lookup this name in the DNLC.  If there was a valid entry,
1711	 * then return the results of the lookup.
1712	 */
1713	error = nfslookup_dnlc(dvp, nm, vpp, cr);
1714	if (error || *vpp != NULL)
1715		return (error);
1716
1717callit:
1718	error = nfslookup_otw(dvp, nm, vpp, cr, rfscall_flags);
1719
1720	return (error);
1721}
1722
1723static int
1724nfslookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
1725{
1726	int error;
1727	vnode_t *vp;
1728
1729	ASSERT(*nm != '\0');
1730	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1731
1732	/*
1733	 * Lookup this name in the DNLC.  If successful, then validate
1734	 * the caches and then recheck the DNLC.  The DNLC is rechecked
1735	 * just in case this entry got invalidated during the call
1736	 * to nfs_validate_caches.
1737	 *
1738	 * An assumption is being made that it is safe to say that a
1739	 * file exists which may not on the server.  Any operations to
1740	 * file exists which may not exist on the server.  Any operations to
1741	 */
1742#ifdef DEBUG
1743	nfs_lookup_dnlc_lookups++;
1744#endif
1745	vp = dnlc_lookup(dvp, nm);
1746	if (vp != NULL) {
1747		VN_RELE(vp);
1748		if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
1749			PURGE_ATTRCACHE(dvp);
1750		}
1751		error = nfs_validate_caches(dvp, cr);
1752		if (error)
1753			return (error);
1754		vp = dnlc_lookup(dvp, nm);
1755		if (vp != NULL) {
1756			error = nfs_access(dvp, VEXEC, 0, cr);
1757			if (error) {
1758				VN_RELE(vp);
1759				return (error);
1760			}
1761			if (vp == DNLC_NO_VNODE) {
1762				VN_RELE(vp);
1763#ifdef DEBUG
1764				nfs_lookup_dnlc_neg_hits++;
1765#endif
1766				return (ENOENT);
1767			}
1768			*vpp = vp;
1769#ifdef DEBUG
1770			nfs_lookup_dnlc_hits++;
1771#endif
1772			return (0);
1773		}
1774#ifdef DEBUG
1775		nfs_lookup_dnlc_disappears++;
1776#endif
1777	}
1778#ifdef DEBUG
1779	else
1780		nfs_lookup_dnlc_misses++;
1781#endif
1782
1783	*vpp = NULL;
1784
1785	return (0);
1786}
1787
1788static int
1789nfslookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
1790	int rfscall_flags)
1791{
1792	int error;
1793	struct nfsdiropargs da;
1794	struct nfsdiropres dr;
1795	int douprintf;
1796	failinfo_t fi;
1797	hrtime_t t;
1798
1799	ASSERT(*nm != '\0');
1800	ASSERT(dvp->v_type == VDIR);
1801	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1802
1803	setdiropargs(&da, nm, dvp);
1804
1805	fi.vp = dvp;
1806	fi.fhp = NULL;		/* no need to update, filehandle not copied */
1807	fi.copyproc = nfscopyfh;
1808	fi.lookupproc = nfslookup;
1809	fi.xattrdirproc = acl_getxattrdir2;
1810
1811	douprintf = 1;
1812
1813	t = gethrtime();
1814
1815	error = rfs2call(VTOMI(dvp), RFS_LOOKUP,
1816	    xdr_diropargs, (caddr_t)&da,
1817	    xdr_diropres, (caddr_t)&dr, cr,
1818	    &douprintf, &dr.dr_status, rfscall_flags, &fi);
1819
1820	if (!error) {
1821		error = geterrno(dr.dr_status);
1822		if (!error) {
1823			*vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
1824			    dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
1825			/*
1826			 * If NFS_ACL is supported on the server, then the
1827			 * attributes returned by server may have minimal
1828			 * permissions sometimes denying access to users having
1829			 * proper access.  To get the proper attributes, mark
1830			 * the attributes as expired so that they will be
1831			 * refetched via the NFS_ACL GETATTR2 procedure.
1832			 */
1833			if (VTOMI(*vpp)->mi_flags & MI_ACL) {
1834				PURGE_ATTRCACHE(*vpp);
1835			}
1836			if (!(rfscall_flags & RFSCALL_SOFT))
1837				dnlc_update(dvp, nm, *vpp);
1838		} else {
1839			PURGE_STALE_FH(error, dvp, cr);
1840			if (error == ENOENT && nfs_lookup_neg_cache)
1841				dnlc_enter(dvp, nm, DNLC_NO_VNODE);
1842		}
1843	}
1844
1845	return (error);
1846}
1847
1848/* ARGSUSED */
1849static int
1850nfs_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
1851	int mode, vnode_t **vpp, cred_t *cr, int lfaware)
1852{
1853	int error;
1854	struct nfscreatargs args;
1855	struct nfsdiropres dr;
1856	int douprintf;
1857	vnode_t *vp;
1858	rnode_t *rp;
1859	struct vattr vattr;
1860	rnode_t *drp;
1861	vnode_t *tempvp;
1862	hrtime_t t;
1863
1864	drp = VTOR(dvp);
1865
1866	if (nfs_zone() != VTOMI(dvp)->mi_zone)
1867		return (EPERM);
1868	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
1869		return (EINTR);
1870
1871	/*
1872	 * We make a copy of the attributes because the caller does not
1873	 * expect us to change what va points to.
1874	 */
1875	vattr = *va;
1876
1877	/*
1878	 * If the pathname is "", just use dvp.  Don't need
1879	 * to send it over the wire, look it up in the dnlc,
1880	 * or perform any access checks.
1881	 */
1882	if (*nm == '\0') {
1883		error = 0;
1884		VN_HOLD(dvp);
1885		vp = dvp;
1886	/*
1887	 * If the pathname is ".", just use dvp.  Don't need
1888	 * to send it over the wire or look it up in the dnlc,
1889	 * just need to check access.
1890	 */
1891	} else if (strcmp(nm, ".") == 0) {
1892		error = nfs_access(dvp, VEXEC, 0, cr);
1893		if (error) {
1894			nfs_rw_exit(&drp->r_rwlock);
1895			return (error);
1896		}
1897		VN_HOLD(dvp);
1898		vp = dvp;
1899	/*
1900	 * We need to go over the wire, just to be sure whether the
1901	 * file exists or not.  Using the DNLC can be dangerous in
1902	 * this case when making a decision regarding existence.
1903	 */
1904	} else {
1905		error = nfslookup_otw(dvp, nm, &vp, cr, 0);
1906	}
1907	if (!error) {
1908		if (exclusive == EXCL)
1909			error = EEXIST;
1910		else if (vp->v_type == VDIR && (mode & VWRITE))
1911			error = EISDIR;
1912		else {
1913			/*
1914			 * If vnode is a device, create special vnode.
1915			 */
1916			if (IS_DEVVP(vp)) {
1917				tempvp = vp;
1918				vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1919				VN_RELE(tempvp);
1920			}
1921			if (!(error = VOP_ACCESS(vp, mode, 0, cr))) {
1922				if ((vattr.va_mask & AT_SIZE) &&
1923				    vp->v_type == VREG) {
1924					vattr.va_mask = AT_SIZE;
1925					error = nfssetattr(vp, &vattr, 0, cr);
1926				}
1927			}
1928		}
1929		nfs_rw_exit(&drp->r_rwlock);
1930		if (error) {
1931			VN_RELE(vp);
1932		} else
1933			*vpp = vp;
1934		return (error);
1935	}
1936
1937	ASSERT(vattr.va_mask & AT_TYPE);
1938	if (vattr.va_type == VREG) {
1939		ASSERT(vattr.va_mask & AT_MODE);
1940		if (MANDMODE(vattr.va_mode)) {
1941			nfs_rw_exit(&drp->r_rwlock);
1942			return (EACCES);
1943		}
1944	}
1945
1946	dnlc_remove(dvp, nm);
1947
1948	setdiropargs(&args.ca_da, nm, dvp);
1949
1950	/*
1951	 * Decide what the group-id of the created file should be.
1952	 * Set it in attribute list as advisory...then do a setattr
1953	 * if the server didn't get it right the first time.
1954	 */
1955	error = setdirgid(dvp, &vattr.va_gid, cr);
1956	if (error) {
1957		nfs_rw_exit(&drp->r_rwlock);
1958		return (error);
1959	}
1960	vattr.va_mask |= AT_GID;
1961
1962	/*
1963	 * This is a completely gross hack to make mknod
1964	 * work over the wire until we can whack the protocol
1965	 */
1966#define	IFCHR		0020000		/* character special */
1967#define	IFBLK		0060000		/* block special */
1968#define	IFSOCK		0140000		/* socket */
1969
1970	/*
1971	 * dev_t is uint_t in 5.x and short in 4.x.  4.x supports 8 bit
1972	 * majors and 8 bit minors; 5.x supports 14 bit majors and 18 bit
1973	 * minors.  If the 5.x major and minor numbers are both <= 8 bits
1974	 * long, compress the device number before sending it.  Otherwise,
1975	 * the 4.x server will not create the device with the correct
1976	 * device number and nothing can be done about this.
1978	 */
1979	if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
1980		dev_t d = vattr.va_rdev;
1981		dev32_t dev32;
1982
1983		if (vattr.va_type == VCHR)
1984			vattr.va_mode |= IFCHR;
1985		else
1986			vattr.va_mode |= IFBLK;
1987
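		/*
		 * cmpldev() yields the 32-bit external form of the dev_t,
		 * laid out as (major << L_BITSMINOR32) | minor.  If any
		 * bits fall outside the old SunOS 4.x major/minor limits,
		 * the device number cannot be expressed in the compressed
		 * 4.x format, so the 32-bit form is sent as is; otherwise
		 * nfsv2_cmpdev() packs it into the old 16-bit layout.
		 */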
1988		(void) cmpldev(&dev32, d);
1989		if (dev32 & ~((SO4_MAXMAJ << L_BITSMINOR32) | SO4_MAXMIN))
1990			vattr.va_size = (u_offset_t)dev32;
1991		else
1992			vattr.va_size = (u_offset_t)nfsv2_cmpdev(d);
1993
1994		vattr.va_mask |= AT_MODE|AT_SIZE;
1995	} else if (vattr.va_type == VFIFO) {
1996		vattr.va_mode |= IFCHR;		/* xtra kludge for namedpipe */
1997		vattr.va_size = (u_offset_t)NFS_FIFO_DEV;	/* blech */
1998		vattr.va_mask |= AT_MODE|AT_SIZE;
1999	} else if (vattr.va_type == VSOCK) {
2000		vattr.va_mode |= IFSOCK;
2001		/*
2002		 * To avoid triggering bugs in the servers, set AT_SIZE
2003		 * (all other RFS_CREATE calls set this).
2004		 */
2005		vattr.va_size = 0;
2006		vattr.va_mask |= AT_MODE|AT_SIZE;
2007	}
2008
2009	args.ca_sa = &args.ca_sa_buf;
2010	error = vattr_to_sattr(&vattr, args.ca_sa);
2011	if (error) {
2012		/* req time field(s) overflow - return immediately */
2013		nfs_rw_exit(&drp->r_rwlock);
2014		return (error);
2015	}
2016
2017	douprintf = 1;
2018
2019	t = gethrtime();
2020
2021	error = rfs2call(VTOMI(dvp), RFS_CREATE,
2022	    xdr_creatargs, (caddr_t)&args,
2023	    xdr_diropres, (caddr_t)&dr, cr,
2024	    &douprintf, &dr.dr_status, 0, NULL);
2025
2026	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2027
2028	if (!error) {
2029		error = geterrno(dr.dr_status);
2030		if (!error) {
2031			if (HAVE_RDDIR_CACHE(drp))
2032				nfs_purge_rddir_cache(dvp);
2033			vp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2034			    dvp->v_vfsp, t, cr, NULL, NULL);
2035			/*
2036			 * If NFS_ACL is supported on the server, then the
2037			 * attributes returned by the server may have minimal
2038			 * permissions, sometimes denying access to users who
2039			 * do have proper access.  To get the proper attributes,
2040			 * mark the attributes as expired so that they will be
2041			 * fetched again via the NFS_ACL GETATTR2 procedure.
2042			 */
2043			if (VTOMI(vp)->mi_flags & MI_ACL) {
2044				PURGE_ATTRCACHE(vp);
2045			}
2046			dnlc_update(dvp, nm, vp);
2047			rp = VTOR(vp);
2048			if (vattr.va_size == 0) {
2049				mutex_enter(&rp->r_statelock);
2050				rp->r_size = 0;
2051				mutex_exit(&rp->r_statelock);
2052				if (vn_has_cached_data(vp)) {
2053					ASSERT(vp->v_type != VCHR);
2054					nfs_invalidate_pages(vp,
2055					    (u_offset_t)0, cr);
2056				}
2057			}
2058
2059			/*
2060			 * Make sure the gid was set correctly.
2061			 * If not, try to set it (but don't lose
2062			 * any sleep over it).
2063			 */
2064			if (vattr.va_gid != rp->r_attr.va_gid) {
2065				vattr.va_mask = AT_GID;
2066				(void) nfssetattr(vp, &vattr, 0, cr);
2067			}
2068
2069			/*
2070			 * If vnode is a device create special vnode
2071			 */
2072			if (IS_DEVVP(vp)) {
2073				*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2074				VN_RELE(vp);
2075			} else
2076				*vpp = vp;
2077		} else {
2078			PURGE_STALE_FH(error, dvp, cr);
2079		}
2080	}
2081
2082	nfs_rw_exit(&drp->r_rwlock);
2083
2084	return (error);
2085}
2086
2087/*
2088 * Weirdness: if the vnode to be removed is open
2089 * we rename it instead of removing it and nfs_inactive
2090 * will remove the new name.
2091 */
2092static int
2093nfs_remove(vnode_t *dvp, char *nm, cred_t *cr)
2094{
2095	int error;
2096	struct nfsdiropargs da;
2097	enum nfsstat status;
2098	vnode_t *vp;
2099	char *tmpname;
2100	int douprintf;
2101	rnode_t *rp;
2102	rnode_t *drp;
2103
2104	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2105		return (EPERM);
2106	drp = VTOR(dvp);
2107	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2108		return (EINTR);
2109
2110	error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2111	if (error) {
2112		nfs_rw_exit(&drp->r_rwlock);
2113		return (error);
2114	}
2115
2116	if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) {
2117		VN_RELE(vp);
2118		nfs_rw_exit(&drp->r_rwlock);
2119		return (EPERM);
2120	}
2121
2122	/*
2123	 * First just remove the entry from the name cache, as it
2124	 * is most likely the only entry for this vp.
2125	 */
2126	dnlc_remove(dvp, nm);
2127
2128	/*
2129	 * If the file has a v_count > 1 then there may be more than one
2130	 * entry in the name cache due to multiple links or an open file,
2131	 * but we don't have the real reference count so flush all
2132	 * possible entries.
2133	 */
2134	if (vp->v_count > 1)
2135		dnlc_purge_vp(vp);
2136
2137	/*
2138	 * Now we have the real reference count on the vnode
2139	 */
2140	rp = VTOR(vp);
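	/*
	 * If the file is still referenced (v_count > 1) and has not
	 * already been given a temporary "unlinked but open" name for
	 * some other link, rename it to a temporary name rather than
	 * removing it; nfs_inactive() removes the temporary name when
	 * the last reference goes away.
	 */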
2141	mutex_enter(&rp->r_statelock);
2142	if (vp->v_count > 1 &&
2143	    (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
2144		mutex_exit(&rp->r_statelock);
2145		tmpname = newname();
2146		error = nfsrename(dvp, nm, dvp, tmpname, cr);
2147		if (error)
2148			kmem_free(tmpname, MAXNAMELEN);
2149		else {
2150			mutex_enter(&rp->r_statelock);
2151			if (rp->r_unldvp == NULL) {
2152				VN_HOLD(dvp);
2153				rp->r_unldvp = dvp;
2154				if (rp->r_unlcred != NULL)
2155					crfree(rp->r_unlcred);
2156				crhold(cr);
2157				rp->r_unlcred = cr;
2158				rp->r_unlname = tmpname;
2159			} else {
2160				kmem_free(rp->r_unlname, MAXNAMELEN);
2161				rp->r_unlname = tmpname;
2162			}
2163			mutex_exit(&rp->r_statelock);
2164		}
2165	} else {
2166		mutex_exit(&rp->r_statelock);
2167		/*
2168		 * We need to flush any dirty pages which happen to
2169		 * be hanging around before removing the file.  This
2170		 * shouldn't happen very often and mostly on file
2171		 * systems mounted "nocto".
2172		 */
2173		if (vn_has_cached_data(vp) &&
2174		    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
2175			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
2176			if (error && (error == ENOSPC || error == EDQUOT)) {
2177				mutex_enter(&rp->r_statelock);
2178				if (!rp->r_error)
2179					rp->r_error = error;
2180				mutex_exit(&rp->r_statelock);
2181			}
2182		}
2183
2184		setdiropargs(&da, nm, dvp);
2185
2186		douprintf = 1;
2187
2188		error = rfs2call(VTOMI(dvp), RFS_REMOVE,
2189		    xdr_diropargs, (caddr_t)&da,
2190		    xdr_enum, (caddr_t)&status, cr,
2191		    &douprintf, &status, 0, NULL);
2192
2193		/*
2194		 * The xattr dir may be gone after last attr is removed,
2195		 * so flush it from dnlc.
2196		 */
2197		if (dvp->v_flag & V_XATTRDIR)
2198			dnlc_purge_vp(dvp);
2199
2200		PURGE_ATTRCACHE(dvp);	/* mod time changed */
2201		PURGE_ATTRCACHE(vp);	/* link count changed */
2202
2203		if (!error) {
2204			error = geterrno(status);
2205			if (!error) {
2206				if (HAVE_RDDIR_CACHE(drp))
2207					nfs_purge_rddir_cache(dvp);
2208			} else {
2209				PURGE_STALE_FH(error, dvp, cr);
2210			}
2211		}
2212	}
2213
2214	VN_RELE(vp);
2215
2216	nfs_rw_exit(&drp->r_rwlock);
2217
2218	return (error);
2219}
2220
2221static int
2222nfs_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr)
2223{
2224	int error;
2225	struct nfslinkargs args;
2226	enum nfsstat status;
2227	vnode_t *realvp;
2228	int douprintf;
2229	rnode_t *tdrp;
2230
2231	if (nfs_zone() != VTOMI(tdvp)->mi_zone)
2232		return (EPERM);
2233	if (VOP_REALVP(svp, &realvp) == 0)
2234		svp = realvp;
2235
2236	args.la_from = VTOFH(svp);
2237	setdiropargs(&args.la_to, tnm, tdvp);
2238
2239	tdrp = VTOR(tdvp);
2240	if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp)))
2241		return (EINTR);
2242
2243	dnlc_remove(tdvp, tnm);
2244
2245	douprintf = 1;
2246
2247	error = rfs2call(VTOMI(svp), RFS_LINK,
2248	    xdr_linkargs, (caddr_t)&args,
2249	    xdr_enum, (caddr_t)&status, cr,
2250	    &douprintf, &status, 0, NULL);
2251
2252	PURGE_ATTRCACHE(tdvp);	/* mod time changed */
2253	PURGE_ATTRCACHE(svp);	/* link count changed */
2254
2255	if (!error) {
2256		error = geterrno(status);
2257		if (!error) {
2258			if (HAVE_RDDIR_CACHE(tdrp))
2259				nfs_purge_rddir_cache(tdvp);
2260		}
2261	}
2262
2263	nfs_rw_exit(&tdrp->r_rwlock);
2264
2265	return (error);
2266}
2267
2268static int
2269nfs_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr)
2270{
2271	vnode_t *realvp;
2272
2273	if (nfs_zone() != VTOMI(odvp)->mi_zone)
2274		return (EPERM);
2275	if (VOP_REALVP(ndvp, &realvp) == 0)
2276		ndvp = realvp;
2277
2278	return (nfsrename(odvp, onm, ndvp, nnm, cr));
2279}
2280
2281/*
2282 * nfsrename does the real work of renaming in NFS Version 2.
2283 */
2284static int
2285nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr)
2286{
2287	int error;
2288	enum nfsstat status;
2289	struct nfsrnmargs args;
2290	int douprintf;
2291	vnode_t *nvp;
2292	vnode_t *ovp = NULL;
2293	char *tmpname;
2294	rnode_t *rp;
2295	rnode_t *odrp;
2296	rnode_t *ndrp;
2297
2298	ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone);
2299	if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
2300	    strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0)
2301		return (EINVAL);
2302
2303	odrp = VTOR(odvp);
2304	ndrp = VTOR(ndvp);
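	/*
	 * Take the two directory rwlocks in rnode address order so that
	 * two concurrent renames in opposite directions between the same
	 * pair of directories cannot deadlock.
	 */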
2305	if ((intptr_t)odrp < (intptr_t)ndrp) {
2306		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp)))
2307			return (EINTR);
2308		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) {
2309			nfs_rw_exit(&odrp->r_rwlock);
2310			return (EINTR);
2311		}
2312	} else {
2313		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp)))
2314			return (EINTR);
2315		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) {
2316			nfs_rw_exit(&ndrp->r_rwlock);
2317			return (EINTR);
2318		}
2319	}
2320
2321	/*
2322	 * Lookup the target file.  If it exists, it needs to be
2323	 * checked to see whether it is a mount point and whether
2324	 * it is active (open).
2325	 */
2326	error = nfslookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0);
2327	if (!error) {
2328		/*
2329		 * If this file has been mounted on, then just
2330		 * return busy because renaming to it would remove
2331		 * the mounted file system from the name space.
2332		 */
2333		if (vn_mountedvfs(nvp) != NULL) {
2334			VN_RELE(nvp);
2335			nfs_rw_exit(&odrp->r_rwlock);
2336			nfs_rw_exit(&ndrp->r_rwlock);
2337			return (EBUSY);
2338		}
2339
2340		/*
2341		 * Purge the name cache of all references to this vnode
2342		 * so that we can check the reference count to infer
2343		 * whether it is active or not.
2344		 */
2345		/*
2346		 * First just remove the entry from the name cache, as it
2347		 * is most likely the only entry for this vp.
2348		 */
2349		dnlc_remove(ndvp, nnm);
2350		/*
2351		 * If the file has a v_count > 1 then there may be more
2352		 * than one entry in the name cache due to multiple links
2353		 * or an open file, but we don't have the real reference
2354		 * count so flush all possible entries.
2355		 */
2356		if (nvp->v_count > 1)
2357			dnlc_purge_vp(nvp);
2358
2359		/*
2360		 * If the vnode is active and is not a directory,
2361		 * arrange to rename it to a
2362		 * temporary file so that it will continue to be
2363		 * accessible.  This implements the "unlink-open-file"
2364		 * semantics for the target of a rename operation.
2365		 * Before doing this though, make sure that the
2366		 * source and target files are not already the same.
2367		 */
2368		if (nvp->v_count > 1 && nvp->v_type != VDIR) {
2369			/*
2370			 * Lookup the source name.
2371			 */
2372			error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL,
2373			    cr, 0);
2374
2375			/*
2376			 * The source name *should* already exist.
2377			 */
2378			if (error) {
2379				VN_RELE(nvp);
2380				nfs_rw_exit(&odrp->r_rwlock);
2381				nfs_rw_exit(&ndrp->r_rwlock);
2382				return (error);
2383			}
2384
2385			/*
2386			 * Compare the two vnodes.  If they are the same,
2387			 * just release all held vnodes and return success.
2388			 */
2389			if (ovp == nvp) {
2390				VN_RELE(ovp);
2391				VN_RELE(nvp);
2392				nfs_rw_exit(&odrp->r_rwlock);
2393				nfs_rw_exit(&ndrp->r_rwlock);
2394				return (0);
2395			}
2396
2397			/*
2398			 * Can't mix and match directories and non-
2399			 * directories in rename operations.  We already
2400			 * know that the target is not a directory.  If
2401			 * the source is a directory, return an error.
2402			 */
2403			if (ovp->v_type == VDIR) {
2404				VN_RELE(ovp);
2405				VN_RELE(nvp);
2406				nfs_rw_exit(&odrp->r_rwlock);
2407				nfs_rw_exit(&ndrp->r_rwlock);
2408				return (ENOTDIR);
2409			}
2410
2411			/*
2412			 * The target file exists, is not the same as
2413			 * the source file, and is active.  Link it
2414			 * to a temporary filename to avoid having
2415			 * the server removing the file completely.
2416			 */
2417			tmpname = newname();
2418			error = nfs_link(ndvp, nvp, tmpname, cr);
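			/*
			 * If the server does not support LINK, fall back
			 * to renaming the target to the temporary name;
			 * this preserves the open file just as well.
			 */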
2419			if (error == EOPNOTSUPP) {
2420				error = nfs_rename(ndvp, nnm, ndvp, tmpname,
2421				    cr);
2422			}
2423			if (error) {
2424				kmem_free(tmpname, MAXNAMELEN);
2425				VN_RELE(ovp);
2426				VN_RELE(nvp);
2427				nfs_rw_exit(&odrp->r_rwlock);
2428				nfs_rw_exit(&ndrp->r_rwlock);
2429				return (error);
2430			}
2431			rp = VTOR(nvp);
2432			mutex_enter(&rp->r_statelock);
2433			if (rp->r_unldvp == NULL) {
2434				VN_HOLD(ndvp);
2435				rp->r_unldvp = ndvp;
2436				if (rp->r_unlcred != NULL)
2437					crfree(rp->r_unlcred);
2438				crhold(cr);
2439				rp->r_unlcred = cr;
2440				rp->r_unlname = tmpname;
2441			} else {
2442				kmem_free(rp->r_unlname, MAXNAMELEN);
2443				rp->r_unlname = tmpname;
2444			}
2445			mutex_exit(&rp->r_statelock);
2446		}
2447
2448		VN_RELE(nvp);
2449	}
2450
2451	if (ovp == NULL) {
2452		/*
2453		 * When renaming directories to be a subdirectory of a
2454		 * different parent, the dnlc entry for ".." will no
2455		 * longer be valid, so it must be removed.
2456		 *
2457		 * We do a lookup here to determine whether we are renaming
2458		 * a directory, and we also need the source vnode to check
2459		 * whether we are renaming an unlinked file.  The lookup
2460		 * might have already been done above, so we check
2461		 * ovp == NULL to avoid doing it twice.
2462		 */
2463
2464		error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0);
2465
2466		/*
2467		 * The source name *should* already exist.
2468		 */
2469		if (error) {
2470			nfs_rw_exit(&odrp->r_rwlock);
2471			nfs_rw_exit(&ndrp->r_rwlock);
2472			return (error);
2473		}
2474		ASSERT(ovp != NULL);
2475	}
2476
2477	dnlc_remove(odvp, onm);
2478	dnlc_remove(ndvp, nnm);
2479
2480	setdiropargs(&args.rna_from, onm, odvp);
2481	setdiropargs(&args.rna_to, nnm, ndvp);
2482
2483	douprintf = 1;
2484
2485	error = rfs2call(VTOMI(odvp), RFS_RENAME,
2486	    xdr_rnmargs, (caddr_t)&args,
2487	    xdr_enum, (caddr_t)&status, cr,
2488	    &douprintf, &status, 0, NULL);
2489
2490	PURGE_ATTRCACHE(odvp);	/* mod time changed */
2491	PURGE_ATTRCACHE(ndvp);	/* mod time changed */
2492
2493	if (!error) {
2494		error = geterrno(status);
2495		if (!error) {
2496			if (HAVE_RDDIR_CACHE(odrp))
2497				nfs_purge_rddir_cache(odvp);
2498			if (HAVE_RDDIR_CACHE(ndrp))
2499				nfs_purge_rddir_cache(ndvp);
2500			/*
2501			 * when renaming directories to be a subdirectory of a
2502			 * different parent, the dnlc entry for ".." will no
2503			 * longer be valid, so it must be removed
2504			 */
2505			rp = VTOR(ovp);
2506			if (ndvp != odvp) {
2507				if (ovp->v_type == VDIR) {
2508					dnlc_remove(ovp, "..");
2509					if (HAVE_RDDIR_CACHE(rp))
2510						nfs_purge_rddir_cache(ovp);
2511				}
2512			}
2513
2514			/*
2515			 * If we are renaming the unlinked file, update the
2516			 * r_unldvp and r_unlname as needed.
2517			 */
2518			mutex_enter(&rp->r_statelock);
2519			if (rp->r_unldvp != NULL) {
2520				if (strcmp(rp->r_unlname, onm) == 0) {
2521					(void) strncpy(rp->r_unlname,
2522					    nnm, MAXNAMELEN);
2523					rp->r_unlname[MAXNAMELEN - 1] = '\0';
2524
2525					if (ndvp != rp->r_unldvp) {
2526						VN_RELE(rp->r_unldvp);
2527						rp->r_unldvp = ndvp;
2528						VN_HOLD(ndvp);
2529					}
2530				}
2531			}
2532			mutex_exit(&rp->r_statelock);
2533		} else {
2534			/*
2535			 * System V defines rename to return EEXIST, not
2536			 * ENOTEMPTY if the target directory is not empty.
2537			 * Over the wire, the error is NFSERR_ENOTEMPTY
2538			 * which geterrno maps to ENOTEMPTY.
2539			 */
2540			if (error == ENOTEMPTY)
2541				error = EEXIST;
2542		}
2543	}
2544
2545	VN_RELE(ovp);
2546
2547	nfs_rw_exit(&odrp->r_rwlock);
2548	nfs_rw_exit(&ndrp->r_rwlock);
2549
2550	return (error);
2551}
2552
2553static int
2554nfs_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr)
2555{
2556	int error;
2557	struct nfscreatargs args;
2558	struct nfsdiropres dr;
2559	int douprintf;
2560	rnode_t *drp;
2561	hrtime_t t;
2562
2563	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2564		return (EPERM);
2565
2566	setdiropargs(&args.ca_da, nm, dvp);
2567
2568	/*
2569	 * Decide what the group-id and set-gid bit of the created directory
2570	 * should be.  May have to do a setattr to get the gid right.
2571	 */
2572	error = setdirgid(dvp, &va->va_gid, cr);
2573	if (error)
2574		return (error);
2575	error = setdirmode(dvp, &va->va_mode, cr);
2576	if (error)
2577		return (error);
2578	va->va_mask |= AT_MODE|AT_GID;
2579
2580	args.ca_sa = &args.ca_sa_buf;
2581	error = vattr_to_sattr(va, args.ca_sa);
2582	if (error) {
2583		/* req time field(s) overflow - return immediately */
2584		return (error);
2585	}
2586
2587	drp = VTOR(dvp);
2588	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2589		return (EINTR);
2590
2591	dnlc_remove(dvp, nm);
2592
2593	douprintf = 1;
2594
2595	t = gethrtime();
2596
2597	error = rfs2call(VTOMI(dvp), RFS_MKDIR,
2598	    xdr_creatargs, (caddr_t)&args,
2599	    xdr_diropres, (caddr_t)&dr, cr,
2600	    &douprintf, &dr.dr_status, 0, NULL);
2601
2602	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2603
2604	if (!error) {
2605		error = geterrno(dr.dr_status);
2606		if (!error) {
2607			if (HAVE_RDDIR_CACHE(drp))
2608				nfs_purge_rddir_cache(dvp);
2609			/*
2610			 * The attributes returned by RFS_MKDIR can not
2611			 * be depended upon, so mark the attribute cache
2612			 * as purged.  A subsequent GETATTR will get the
2613			 * correct attributes from the server.
2614			 */
2615			*vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2616			    dvp->v_vfsp, t, cr, NULL, NULL);
2617			PURGE_ATTRCACHE(*vpp);
2618			dnlc_update(dvp, nm, *vpp);
2619
2620			/*
2621			 * Make sure the gid was set correctly.
2622			 * If not, try to set it (but don't lose
2623			 * any sleep over it).
2624			 */
2625			if (va->va_gid != VTOR(*vpp)->r_attr.va_gid) {
2626				va->va_mask = AT_GID;
2627				(void) nfssetattr(*vpp, va, 0, cr);
2628			}
2629		} else {
2630			PURGE_STALE_FH(error, dvp, cr);
2631		}
2632	}
2633
2634	nfs_rw_exit(&drp->r_rwlock);
2635
2636	return (error);
2637}
2638
2639static int
2640nfs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr)
2641{
2642	int error;
2643	enum nfsstat status;
2644	struct nfsdiropargs da;
2645	vnode_t *vp;
2646	int douprintf;
2647	rnode_t *drp;
2648
2649	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2650		return (EPERM);
2651	drp = VTOR(dvp);
2652	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2653		return (EINTR);
2654
2655	/*
2656	 * Attempt to prevent a rmdir(".") from succeeding.
2657	 */
2658	error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2659	if (error) {
2660		nfs_rw_exit(&drp->r_rwlock);
2661		return (error);
2662	}
2663
2664	if (vp == cdir) {
2665		VN_RELE(vp);
2666		nfs_rw_exit(&drp->r_rwlock);
2667		return (EINVAL);
2668	}
2669
2670	setdiropargs(&da, nm, dvp);
2671
2672	/*
2673	 * First just remove the entry from the name cache, as it
2674	 * is most likely an entry for this vp.
2675	 */
2676	dnlc_remove(dvp, nm);
2677
2678	/*
2679	 * If the vnode reference count is greater than one, then
2680	 * there may be additional references in the DNLC which will
2681	 * need to be purged.  First, try removing the entry for
2682	 * the parent directory and see if that removes the additional
2683	 * reference(s).  If that doesn't do it, then use dnlc_purge_vp
2684	 * to completely remove any references to the directory which
2685	 * might still exist in the DNLC.
2686	 */
2687	if (vp->v_count > 1) {
2688		dnlc_remove(vp, "..");
2689		if (vp->v_count > 1)
2690			dnlc_purge_vp(vp);
2691	}
2692
2693	douprintf = 1;
2694
2695	error = rfs2call(VTOMI(dvp), RFS_RMDIR,
2696	    xdr_diropargs, (caddr_t)&da,
2697	    xdr_enum, (caddr_t)&status, cr,
2698	    &douprintf, &status, 0, NULL);
2699
2700	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2701
2702	if (error) {
2703		VN_RELE(vp);
2704		nfs_rw_exit(&drp->r_rwlock);
2705		return (error);
2706	}
2707
2708	error = geterrno(status);
2709	if (!error) {
2710		if (HAVE_RDDIR_CACHE(drp))
2711			nfs_purge_rddir_cache(dvp);
2712		if (HAVE_RDDIR_CACHE(VTOR(vp)))
2713			nfs_purge_rddir_cache(vp);
2714	} else {
2715		PURGE_STALE_FH(error, dvp, cr);
2716		/*
2717		 * System V defines rmdir to return EEXIST, not
2718		 * ENOTEMPTY if the directory is not empty.  Over
2719		 * the wire, the error is NFSERR_ENOTEMPTY which
2720		 * geterrno maps to ENOTEMPTY.
2721		 */
2722		if (error == ENOTEMPTY)
2723			error = EEXIST;
2724	}
2725
2726	VN_RELE(vp);
2727
2728	nfs_rw_exit(&drp->r_rwlock);
2729
2730	return (error);
2731}
2732
2733static int
2734nfs_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr)
2735{
2736	int error;
2737	struct nfsslargs args;
2738	enum nfsstat status;
2739	int douprintf;
2740	rnode_t *drp;
2741
2742	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2743		return (EPERM);
2744	setdiropargs(&args.sla_from, lnm, dvp);
2745	args.sla_sa = &args.sla_sa_buf;
2746	error = vattr_to_sattr(tva, args.sla_sa);
2747	if (error) {
2748		/* req time field(s) overflow - return immediately */
2749		return (error);
2750	}
2751	args.sla_tnm = tnm;
2752
2753	drp = VTOR(dvp);
2754	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2755		return (EINTR);
2756
2757	dnlc_remove(dvp, lnm);
2758
2759	douprintf = 1;
2760
2761	error = rfs2call(VTOMI(dvp), RFS_SYMLINK,
2762	    xdr_slargs, (caddr_t)&args,
2763	    xdr_enum, (caddr_t)&status, cr,
2764	    &douprintf, &status, 0, NULL);
2765
2766	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2767
2768	if (!error) {
2769		error = geterrno(status);
2770		if (!error) {
2771			if (HAVE_RDDIR_CACHE(drp))
2772				nfs_purge_rddir_cache(dvp);
2773		} else {
2774			PURGE_STALE_FH(error, dvp, cr);
2775		}
2776	}
2777
2778	nfs_rw_exit(&drp->r_rwlock);
2779
2780	return (error);
2781}
2782
2783#ifdef DEBUG
2784static int nfs_readdir_cache_hits = 0;
2785static int nfs_readdir_cache_shorts = 0;
2786static int nfs_readdir_cache_waits = 0;
2787static int nfs_readdir_cache_misses = 0;
2788static int nfs_readdir_readahead = 0;
2789#endif
2790
2791static int nfs_shrinkreaddir = 0;
2792
2793/*
2794 * Read directory entries.
2795 * There are some weird things to look out for here.  The uio_offset
2796 * field is either 0 or it is the offset returned from a previous
2797 * readdir.  It is an opaque value used by the server to find the
2798 * correct directory block to read. The count field is the number
2799 * of blocks to read on the server.  This is advisory only; the server
2800 * may return only one block's worth of entries.  Entries may be compressed
2801 * on the server.
2802 */
2803static int
2804nfs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp)
2805{
2806	int error;
2807	size_t count;
2808	rnode_t *rp;
2809	rddir_cache *rdc;
2810	rddir_cache *nrdc;
2811	rddir_cache *rrdc;
2812#ifdef DEBUG
2813	int missed;
2814#endif
2815	rddir_cache srdc;
2816	avl_index_t where;
2817
2818	rp = VTOR(vp);
2819
2820	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2821	if (nfs_zone() != VTOMI(vp)->mi_zone)
2822		return (EIO);
2823	/*
2824	 * Make sure that the directory cache is valid.
2825	 */
2826	if (HAVE_RDDIR_CACHE(rp)) {
2827		if (nfs_disable_rddir_cache) {
2828			/*
2829			 * Setting nfs_disable_rddir_cache in /etc/system
2830			 * allows interoperability with servers that do not
2831			 * properly update the attributes of directories.
2832			 * Any cached information gets purged before an
2833			 * access is made to it.
2834			 */
2835			nfs_purge_rddir_cache(vp);
2836		} else {
2837			error = nfs_validate_caches(vp, cr);
2838			if (error)
2839				return (error);
2840		}
2841	}
2842
2843	/*
2844	 * UGLINESS: SunOS 3.2 servers apparently cannot always handle an
2845	 * RFS_READDIR request with rda_count set to more than 0x400. So
2846	 * we reduce the request size here purely for compatibility.
2847	 *
2848	 * In general, this is no longer required.  However, if a server
2849	 * is discovered which cannot handle requests larger than 1024,
2850	 * nfs_shrinkreaddir can be set to 1 to enable this backwards
2851	 * compatibility.
2852	 *
2853	 * In any case, the request size is limited to NFS_MAXDATA bytes.
2854	 */
2855	count = MIN(uiop->uio_iov->iov_len,
2856	    nfs_shrinkreaddir ? 0x400 : NFS_MAXDATA);
2857
2858	nrdc = NULL;
2859#ifdef DEBUG
2860	missed = 0;
2861#endif
2862top:
2863	/*
2864	 * Short-circuit the last readdir, which always returns 0 bytes.
2865	 * This can be done after the directory has been read through
2866	 * completely at least once.  This will set r_direof which
2867	 * can be used to find the value of the last cookie.
2868	 */
2869	mutex_enter(&rp->r_statelock);
2870	if (rp->r_direof != NULL &&
2871	    uiop->uio_offset == rp->r_direof->nfs_ncookie) {
2872		mutex_exit(&rp->r_statelock);
2873#ifdef DEBUG
2874		nfs_readdir_cache_shorts++;
2875#endif
2876		if (eofp)
2877			*eofp = 1;
2878		if (nrdc != NULL)
2879			rddir_cache_rele(nrdc);
2880		return (0);
2881	}
2882	/*
2883	 * Look for a cache entry.  Cache entries are identified
2884	 * by the NFS cookie value and the byte count requested.
2885	 */
2886	srdc.nfs_cookie = uiop->uio_offset;
2887	srdc.buflen = count;
2888	rdc = avl_find(&rp->r_dir, &srdc, &where);
2889	if (rdc != NULL) {
2890		rddir_cache_hold(rdc);
2891		/*
2892		 * If the cache entry is in the process of being
2893		 * filled in, wait until this completes.  The
2894		 * RDDIRWAIT bit is set to indicate that someone
2895		 * is waiting, and when the thread currently
2896		 * filling the entry is done, it should do a
2897		 * cv_broadcast to wake up all of the threads
2898		 * waiting for it to finish.
2899		 */
2900		if (rdc->flags & RDDIR) {
2901			nfs_rw_exit(&rp->r_rwlock);
2902			rdc->flags |= RDDIRWAIT;
2903#ifdef DEBUG
2904			nfs_readdir_cache_waits++;
2905#endif
2906			if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
2907				/*
2908				 * We got interrupted, probably
2909				 * the user typed ^C or an alarm
2910				 * fired.  We free the new entry
2911				 * if we allocated one.
2912				 */
2913				mutex_exit(&rp->r_statelock);
2914				(void) nfs_rw_enter_sig(&rp->r_rwlock,
2915				    RW_READER, FALSE);
2916				rddir_cache_rele(rdc);
2917				if (nrdc != NULL)
2918					rddir_cache_rele(nrdc);
2919				return (EINTR);
2920			}
2921			mutex_exit(&rp->r_statelock);
2922			(void) nfs_rw_enter_sig(&rp->r_rwlock,
2923			    RW_READER, FALSE);
2924			rddir_cache_rele(rdc);
2925			goto top;
2926		}
2927		/*
2928		 * Check to see if a readdir is required to
2929		 * fill the entry.  If so, mark this entry
2930		 * as being filled, remove our reference,
2931		 * and branch to the code to fill the entry.
2932		 */
2933		if (rdc->flags & RDDIRREQ) {
2934			rdc->flags &= ~RDDIRREQ;
2935			rdc->flags |= RDDIR;
2936			if (nrdc != NULL)
2937				rddir_cache_rele(nrdc);
2938			nrdc = rdc;
2939			mutex_exit(&rp->r_statelock);
2940			goto bottom;
2941		}
2942#ifdef DEBUG
2943		if (!missed)
2944			nfs_readdir_cache_hits++;
2945#endif
2946		/*
2947		 * If an error occurred while attempting
2948		 * to fill the cache entry, just return it.
2949		 */
2950		if (rdc->error) {
2951			error = rdc->error;
2952			mutex_exit(&rp->r_statelock);
2953			rddir_cache_rele(rdc);
2954			if (nrdc != NULL)
2955				rddir_cache_rele(nrdc);
2956			return (error);
2957		}
2958
2959		/*
2960		 * The cache entry is complete and good,
2961		 * copyout the dirent structs to the calling
2962		 * thread.
2963		 */
2964		error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop);
2965
2966		/*
2967		 * If no error occurred during the copyout,
2968		 * update the offset in the uio struct to
2969		 * contain the value of the next cookie
2970		 * and set the eof value appropriately.
2971		 */
2972		if (!error) {
2973			uiop->uio_offset = rdc->nfs_ncookie;
2974			if (eofp)
2975				*eofp = rdc->eof;
2976		}
2977
2978		/*
2979		 * Decide whether to do readahead.  Don't if we
2980		 * have already read to the end of the directory.
2981		 */
2982		if (rdc->eof) {
2983			rp->r_direof = rdc;
2984			mutex_exit(&rp->r_statelock);
2985			rddir_cache_rele(rdc);
2986			if (nrdc != NULL)
2987				rddir_cache_rele(nrdc);
2988			return (error);
2989		}
2990
2991		/*
2992		 * Check to see whether we found an entry
2993		 * for the readahead.  If so, we don't need
2994		 * to do anything further, so free the new
2995		 * entry if one was allocated.  Otherwise,
2996		 * allocate a new entry, add it to the cache,
2997		 * and then initiate an asynchronous readdir
2998		 * operation to fill it.
2999		 */
3000		srdc.nfs_cookie = rdc->nfs_ncookie;
3001		srdc.buflen = count;
3002		rrdc = avl_find(&rp->r_dir, &srdc, &where);
3003		if (rrdc != NULL) {
3004			if (nrdc != NULL)
3005				rddir_cache_rele(nrdc);
3006		} else {
3007			if (nrdc != NULL)
3008				rrdc = nrdc;
3009			else {
3010				rrdc = rddir_cache_alloc(KM_NOSLEEP);
3011			}
3012			if (rrdc != NULL) {
3013				rrdc->nfs_cookie = rdc->nfs_ncookie;
3014				rrdc->buflen = count;
3015				avl_insert(&rp->r_dir, rrdc, where);
3016				rddir_cache_hold(rrdc);
3017				mutex_exit(&rp->r_statelock);
3018				rddir_cache_rele(rdc);
3019#ifdef DEBUG
3020				nfs_readdir_readahead++;
3021#endif
3022				nfs_async_readdir(vp, rrdc, cr, nfsreaddir);
3023				return (error);
3024			}
3025		}
3026
3027		mutex_exit(&rp->r_statelock);
3028		rddir_cache_rele(rdc);
3029		return (error);
3030	}
3031
3032	/*
3033	 * Didn't find an entry in the cache.  Construct a new empty
3034	 * entry and link it into the cache.  Other processes attempting
3035	 * to access this entry will need to wait until it is filled in.
3036	 *
3037	 * Since kmem_alloc may block, another pass through the cache
3038	 * will need to be taken to make sure that another process
3039	 * hasn't already added an entry to the cache for this request.
3040	 */
3041	if (nrdc == NULL) {
3042		mutex_exit(&rp->r_statelock);
3043		nrdc = rddir_cache_alloc(KM_SLEEP);
3044		nrdc->nfs_cookie = uiop->uio_offset;
3045		nrdc->buflen = count;
3046		goto top;
3047	}
3048
3049	/*
3050	 * Add this entry to the cache.
3051	 */
3052	avl_insert(&rp->r_dir, nrdc, where);
3053	rddir_cache_hold(nrdc);
3054	mutex_exit(&rp->r_statelock);
3055
3056bottom:
3057#ifdef DEBUG
3058	missed = 1;
3059	nfs_readdir_cache_misses++;
3060#endif
3061	/*
3062	 * Do the readdir.
3063	 */
3064	error = nfsreaddir(vp, nrdc, cr);
3065
3066	/*
3067	 * If this operation failed, just return the error which occurred.
3068	 */
3069	if (error != 0)
3070		return (error);
3071
3072	/*
3073	 * Since the RPC operation will have taken some time and blocked
3074	 * this process, another pass through the cache will need to be
3075	 * taken to find the correct cache entry.  It is possible that
3076	 * the correct cache entry will not be there (although one was
3077	 * added) because the directory changed during the RPC operation
3078	 * and the readdir cache was flushed.  In this case, just start
3079	 * over.  It is hoped that this will not happen too often... :-)
3080	 */
3081	nrdc = NULL;
3082	goto top;
3083	/* NOTREACHED */
3084}
3085
3086static int
3087nfsreaddir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
3088{
3089	int error;
3090	struct nfsrddirargs rda;
3091	struct nfsrddirres rd;
3092	rnode_t *rp;
3093	mntinfo_t *mi;
3094	uint_t count;
3095	int douprintf;
3096	failinfo_t fi, *fip;
3097
3098	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
3099	count = rdc->buflen;
3100
3101	rp = VTOR(vp);
3102	mi = VTOMI(vp);
3103
3104	rda.rda_fh = *VTOFH(vp);
3105	rda.rda_offset = rdc->nfs_cookie;
3106
3107	/*
3108	 * NFS client failover support: suppress failover unless we have a
3109	 * zero cookie; non-zero cookies are opaque, server-specific values.
3110	 */
3111	if (rdc->nfs_cookie == (off_t)0) {
3112		fi.vp = vp;
3113		fi.fhp = (caddr_t)&rda.rda_fh;
3114		fi.copyproc = nfscopyfh;
3115		fi.lookupproc = nfslookup;
3116		fi.xattrdirproc = acl_getxattrdir2;
3117		fip = &fi;
3118	} else {
3119		fip = NULL;
3120	}
3121
3122	rd.rd_entries = kmem_alloc(rdc->buflen, KM_SLEEP);
3123	rd.rd_size = count;
3124	rd.rd_offset = rda.rda_offset;
3125
3126	douprintf = 1;
3127
3128	if (mi->mi_io_kstats) {
3129		mutex_enter(&mi->mi_lock);
3130		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3131		mutex_exit(&mi->mi_lock);
3132	}
3133
3134	do {
3135		rda.rda_count = MIN(count, mi->mi_curread);
3136		error = rfs2call(mi, RFS_READDIR,
3137		    xdr_rddirargs, (caddr_t)&rda,
3138		    xdr_getrddirres, (caddr_t)&rd, cr,
3139		    &douprintf, &rd.rd_status, 0, fip);
3140	} while (error == ENFS_TRYAGAIN);
3141
3142	if (mi->mi_io_kstats) {
3143		mutex_enter(&mi->mi_lock);
3144		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3145		mutex_exit(&mi->mi_lock);
3146	}
3147
3148	/*
3149	 * Since we are actually doing a READDIR RPC, we must have
3150	 * exclusive access to the cache entry being filled.  Thus,
3151	 * it is safe to update all fields except for the flags
3152	 * field.  The r_statelock in the rnode must be held to
3153	 * prevent two different threads from simultaneously
3154	 * attempting to update the flags field.  This can happen
3155	 * if we are turning off RDDIR and the other thread is
3156	 * trying to set RDDIRWAIT.
3157	 */
3158	ASSERT(rdc->flags & RDDIR);
3159	if (!error) {
3160		error = geterrno(rd.rd_status);
3161		if (!error) {
3162			rdc->nfs_ncookie = rd.rd_offset;
3163			rdc->eof = rd.rd_eof ? 1 : 0;
3164			rdc->entlen = rd.rd_size;
3165			ASSERT(rdc->entlen <= rdc->buflen);
3166#ifdef DEBUG
3167			rdc->entries = rddir_cache_buf_alloc(rdc->buflen,
3168			    KM_SLEEP);
3169#else
3170			rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
3171#endif
3172			bcopy(rd.rd_entries, rdc->entries, rdc->entlen);
3173			rdc->error = 0;
3174			if (mi->mi_io_kstats) {
3175				mutex_enter(&mi->mi_lock);
3176				KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
3177				KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
3178				    rd.rd_size;
3179				mutex_exit(&mi->mi_lock);
3180			}
3181		} else {
3182			PURGE_STALE_FH(error, vp, cr);
3183		}
3184	}
3185	if (error) {
3186		rdc->entries = NULL;
3187		rdc->error = error;
3188	}
3189	kmem_free(rd.rd_entries, rdc->buflen);
3190
3191	mutex_enter(&rp->r_statelock);
3192	rdc->flags &= ~RDDIR;
3193	if (rdc->flags & RDDIRWAIT) {
3194		rdc->flags &= ~RDDIRWAIT;
3195		cv_broadcast(&rdc->cv);
3196	}
3197	if (error)
3198		rdc->flags |= RDDIRREQ;
3199	mutex_exit(&rp->r_statelock);
3200
3201	rddir_cache_rele(rdc);
3202
3203	return (error);
3204}
3205
3206#ifdef DEBUG
3207static int nfs_bio_do_stop = 0;
3208#endif
3209
3210static int
3211nfs_bio(struct buf *bp, cred_t *cr)
3212{
3213	rnode_t *rp = VTOR(bp->b_vp);
3214	int count;
3215	int error;
3216	cred_t *cred;
3217	uint_t offset;
3218
3219	DTRACE_IO1(start, struct buf *, bp);
3220
3221	ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone);
3222	offset = dbtob(bp->b_blkno);
3223
3224	if (bp->b_flags & B_READ) {
3225		mutex_enter(&rp->r_statelock);
3226		if (rp->r_cred != NULL) {
3227			cred = rp->r_cred;
3228			crhold(cred);
3229		} else {
3230			rp->r_cred = cr;
3231			crhold(cr);
3232			cred = cr;
3233			crhold(cred);
3234		}
3235		mutex_exit(&rp->r_statelock);
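		/*
		 * The read is first attempted with the credentials
		 * cached in the rnode; if the server rejects those
		 * with EACCES, the code below retries once using the
		 * caller's credentials.
		 */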
3236	read_again:
3237		error = bp->b_error = nfsread(bp->b_vp, bp->b_un.b_addr,
3238		    offset, bp->b_bcount, &bp->b_resid, cred);
3239		crfree(cred);
3240		if (!error) {
3241			if (bp->b_resid) {
3242				/*
3243				 * Didn't get it all because we hit EOF;
3244				 * zero all the memory beyond the EOF.
3245				 */
3247				bzero(bp->b_un.b_addr +
3248				    bp->b_bcount - bp->b_resid, bp->b_resid);
3249			}
3250			mutex_enter(&rp->r_statelock);
3251			if (bp->b_resid == bp->b_bcount &&
3252			    offset >= rp->r_size) {
3253				/*
3254				 * We didn't read anything at all as we are
3255				 * past EOF.  Return an error indicator back
3256				 * but don't destroy the pages (yet).
3257				 */
3258				error = NFS_EOF;
3259			}
3260			mutex_exit(&rp->r_statelock);
3261		} else if (error == EACCES) {
3262			mutex_enter(&rp->r_statelock);
3263			if (cred != cr) {
3264				if (rp->r_cred != NULL)
3265					crfree(rp->r_cred);
3266				rp->r_cred = cr;
3267				crhold(cr);
3268				cred = cr;
3269				crhold(cred);
3270				mutex_exit(&rp->r_statelock);
3271				goto read_again;
3272			}
3273			mutex_exit(&rp->r_statelock);
3274		}
3275	} else {
3276		if (!(rp->r_flags & RSTALE)) {
3277			mutex_enter(&rp->r_statelock);
3278			if (rp->r_cred != NULL) {
3279				cred = rp->r_cred;
3280				crhold(cred);
3281			} else {
3282				rp->r_cred = cr;
3283				crhold(cr);
3284				cred = cr;
3285				crhold(cred);
3286			}
3287			mutex_exit(&rp->r_statelock);
3288		write_again:
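			/*
			 * Clamp the write to the current file size so
			 * that data in the last page beyond the logical
			 * EOF is never sent to the server.
			 */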
3289			mutex_enter(&rp->r_statelock);
3290			count = MIN(bp->b_bcount, rp->r_size - offset);
3291			mutex_exit(&rp->r_statelock);
3292			if (count < 0)
3293				cmn_err(CE_PANIC, "nfs_bio: write count < 0");
3294#ifdef DEBUG
3295			if (count == 0) {
3296				zcmn_err(getzoneid(), CE_WARN,
3297				    "nfs_bio: zero length write at %d",
3298				    offset);
3299				nfs_printfhandle(&rp->r_fh);
3300				if (nfs_bio_do_stop)
3301					debug_enter("nfs_bio");
3302			}
3303#endif
3304			error = nfswrite(bp->b_vp, bp->b_un.b_addr, offset,
3305			    count, cred);
3306			if (error == EACCES) {
3307				mutex_enter(&rp->r_statelock);
3308				if (cred != cr) {
3309					if (rp->r_cred != NULL)
3310						crfree(rp->r_cred);
3311					rp->r_cred = cr;
3312					crhold(cr);
3313					crfree(cred);
3314					cred = cr;
3315					crhold(cred);
3316					mutex_exit(&rp->r_statelock);
3317					goto write_again;
3318				}
3319				mutex_exit(&rp->r_statelock);
3320			}
3321			bp->b_error = error;
3322			if (error && error != EINTR) {
3323				/*
3324				 * Don't print EDQUOT errors on the console.
3325				 * Don't print asynchronous EACCES errors.
3326				 * Don't print EFBIG errors.
3327				 * Print all other write errors.
3328				 */
3329				if (error != EDQUOT && error != EFBIG &&
3330				    (error != EACCES ||
3331				    !(bp->b_flags & B_ASYNC)))
3332					nfs_write_error(bp->b_vp, error, cred);
3333				/*
3334				 * Update r_error and r_flags as appropriate.
3335				 * If the error was ESTALE, then mark the
3336				 * rnode as not being writeable and save
3337				 * the error status.  Otherwise, save any
3338				 * errors which occur from asynchronous
3339				 * page invalidations.  Any errors occurring
3340				 * from other operations should be saved
3341				 * by the caller.
3342				 */
3343				mutex_enter(&rp->r_statelock);
3344				if (error == ESTALE) {
3345					rp->r_flags |= RSTALE;
3346					if (!rp->r_error)
3347						rp->r_error = error;
3348				} else if (!rp->r_error &&
3349				    (bp->b_flags &
3350				    (B_INVAL|B_FORCE|B_ASYNC)) ==
3351				    (B_INVAL|B_FORCE|B_ASYNC)) {
3352					rp->r_error = error;
3353				}
3354				mutex_exit(&rp->r_statelock);
3355			}
3356			crfree(cred);
3357		} else
3358			error = rp->r_error;
3359	}
3360
3361	if (error != 0 && error != NFS_EOF)
3362		bp->b_flags |= B_ERROR;
3363
3364	DTRACE_IO1(done, struct buf *, bp);
3365
3366	return (error);
3367}
3368
3369static int
3370nfs_fid(vnode_t *vp, fid_t *fidp)
3371{
3372	struct nfs_fid *fp;
3373	rnode_t *rp;
3374
3375	rp = VTOR(vp);
3376
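	/*
	 * If the caller's buffer is too small, report the size needed
	 * and return ENOSPC, per the usual VOP_FID() convention.
	 */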
3377	if (fidp->fid_len < (sizeof (struct nfs_fid) - sizeof (short))) {
3378		fidp->fid_len = sizeof (struct nfs_fid) - sizeof (short);
3379		return (ENOSPC);
3380	}
3381	fp = (struct nfs_fid *)fidp;
3382	fp->nf_pad = 0;
3383	fp->nf_len = sizeof (struct nfs_fid) - sizeof (short);
3384	bcopy(rp->r_fh.fh_buf, fp->nf_data, NFS_FHSIZE);
3385	return (0);
3386}
3387
3388/* ARGSUSED2 */
3389static int
3390nfs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3391{
3392	rnode_t *rp = VTOR(vp);
3393
3394	if (!write_lock) {
3395		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3396		return (V_WRITELOCK_FALSE);
3397	}
3398
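	/*
	 * For direct i/o, a read lock suffices even for a write as long
	 * as there are no mapped or cached pages which might need to be
	 * invalidated; otherwise fall through and take the write lock.
	 */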
3399	if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) {
3400		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3401		if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp))
3402			return (V_WRITELOCK_FALSE);
3403		nfs_rw_exit(&rp->r_rwlock);
3404	}
3405
3406	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
3407	return (V_WRITELOCK_TRUE);
3408}
3409
3410/* ARGSUSED */
3411static void
3412nfs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3413{
3414	rnode_t *rp = VTOR(vp);
3415
3416	nfs_rw_exit(&rp->r_rwlock);
3417}
3418
3419/* ARGSUSED */
3420static int
3421nfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp)
3422{
3423
3424	/*
3425	 * Because we stuff the readdir cookie into the offset field,
3426	 * someone may attempt to do an lseek with the cookie, which
3427	 * we want to succeed.
3428	 */
3429	if (vp->v_type == VDIR)
3430		return (0);
3431	if (*noffp < 0 || *noffp > MAXOFF32_T)
3432		return (EINVAL);
3433	return (0);
3434}
3435
3436/*
3437 * number of NFS_MAXDATA blocks to read ahead,
3438 * optimized for 100 base-T.
3439 */
3440static int nfs_nra = 4;
3441
3442#ifdef DEBUG
3443static int nfs_lostpage = 0;	/* number of times we lost original page */
3444#endif
3445
3446/*
3447 * Return all the pages from [off..off+len) in file
3448 */
3449static int
3450nfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
3451	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3452	enum seg_rw rw, cred_t *cr)
3453{
3454	rnode_t *rp;
3455	int error;
3456	mntinfo_t *mi;
3457
3458	if (vp->v_flag & VNOMAP)
3459		return (ENOSYS);
3460
3461	ASSERT(off <= MAXOFF32_T);
3462	if (nfs_zone() != VTOMI(vp)->mi_zone)
3463		return (EIO);
3464	if (protp != NULL)
3465		*protp = PROT_ALL;
3466
3467	/*
3468	 * Now validate that the caches are up to date.
3469	 */
3470	error = nfs_validate_caches(vp, cr);
3471	if (error)
3472		return (error);
3473
3474	rp = VTOR(vp);
3475	mi = VTOMI(vp);
3476retry:
3477	mutex_enter(&rp->r_statelock);
3478
3479	/*
3480	 * Don't create dirty pages faster than they
3481	 * can be cleaned so that the system doesn't
3482	 * get imbalanced.  If the async queue is
3483	 * maxed out, then wait for it to drain before
3484	 * creating more dirty pages.  Also, wait for
3485	 * any threads doing pagewalks in the vop_getattr
3486	 * entry points so that they don't block for
3487	 * long periods.
3488	 */
3489	if (rw == S_CREATE) {
3490		while ((mi->mi_max_threads != 0 &&
3491		    rp->r_awcount > 2 * mi->mi_max_threads) ||
3492		    rp->r_gcount > 0)
3493			cv_wait(&rp->r_cv, &rp->r_statelock);
3494	}
3495
3496	/*
3497	 * If we are getting called as a side effect of an nfs_write()
3498	 * operation the local file size might not be extended yet.
3499	 * In this case we want to be able to return pages of zeroes.
3500	 */
3501	if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
3502		mutex_exit(&rp->r_statelock);
3503		return (EFAULT);		/* beyond EOF */
3504	}
3505
3506	mutex_exit(&rp->r_statelock);
3507
3508	if (len <= PAGESIZE) {
3509		error = nfs_getapage(vp, off, len, protp, pl, plsz,
3510		    seg, addr, rw, cr);
3511	} else {
3512		error = pvn_getpages(nfs_getapage, vp, off, len, protp,
3513		    pl, plsz, seg, addr, rw, cr);
3514	}
3515
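	/*
	 * NFS_EOF indicates that the read fell entirely beyond the
	 * (possibly stale) cached file size; purge the caches, keeping
	 * the DNLC, and retry from fresh state.
	 */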
3516	switch (error) {
3517	case NFS_EOF:
3518		nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
3519		goto retry;
3520	case ESTALE:
3521		PURGE_STALE_FH(error, vp, cr);
3522	}
3523
3524	return (error);
3525}
3526
3527/*
3528 * Called from pvn_getpages or nfs_getpage to get a particular page.
3529 */
3530/* ARGSUSED */
3531static int
3532nfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
3533	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3534	enum seg_rw rw, cred_t *cr)
3535{
3536	rnode_t *rp;
3537	uint_t bsize;
3538	struct buf *bp;
3539	page_t *pp;
3540	u_offset_t lbn;
3541	u_offset_t io_off;
3542	u_offset_t blkoff;
3543	u_offset_t rablkoff;
3544	size_t io_len;
3545	uint_t blksize;
3546	int error;
3547	int readahead;
3548	int readahead_issued = 0;
3549	int ra_window; /* readahead window */
3550	page_t *pagefound;
3551
3552	if (nfs_zone() != VTOMI(vp)->mi_zone)
3553		return (EIO);
3554	rp = VTOR(vp);
3555	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
3556
3557reread:
3558	bp = NULL;
3559	pp = NULL;
3560	pagefound = NULL;
3561
3562	if (pl != NULL)
3563		pl[0] = NULL;
3564
3565	error = 0;
3566	lbn = off / bsize;
3567	blkoff = lbn * bsize;
3568
3569	/*
3570	 * Queueing up the readahead before doing the synchronous read
3571	 * results in a significant increase in read throughput because
3572	 * of the increased parallelism between the async threads and
3573	 * the process context.
3574	 */
3575	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
3576	    rw != S_CREATE &&
3577	    !(vp->v_flag & VNOCACHE)) {
3578		mutex_enter(&rp->r_statelock);
3579
3580		/*
3581		 * Calculate the number of readaheads to do.
3582		 * a) No readaheads at offset = 0.
3583		 * b) Do maximum(nfs_nra) readaheads when the readahead
3584		 *    window is closed.
3585		 * c) Do between 1 and (nfs_nra - 1) readaheads depending
3586		 *    upon how far open or closed the readahead window is.
3587		 * d) No readaheads if rp->r_nextr is not within the scope
3588		 *    of the readahead window (random i/o).
3589		 */
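		/*
		 * For example, with nfs_nra == 4, a request one block
		 * behind the readahead offset (r_nextr == blkoff + bsize)
		 * gets ra_window == 1 and issues three readaheads.
		 */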
3590
3591		if (off == 0)
3592			readahead = 0;
3593		else if (blkoff == rp->r_nextr)
3594			readahead = nfs_nra;
3595		else if (rp->r_nextr > blkoff &&
3596		    ((ra_window = (rp->r_nextr - blkoff) / bsize)
3597		    <= (nfs_nra - 1)))
3598			readahead = nfs_nra - ra_window;
3599		else
3600			readahead = 0;
3601
3602		rablkoff = rp->r_nextr;
3603		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
3604			mutex_exit(&rp->r_statelock);
3605			if (nfs_async_readahead(vp, rablkoff + bsize,
3606			    addr + (rablkoff + bsize - off), seg, cr,
3607			    nfs_readahead) < 0) {
3608				mutex_enter(&rp->r_statelock);
3609				break;
3610			}
3611			readahead--;
3612			rablkoff += bsize;
3613			/*
3614			 * Indicate that we did a readahead so
3615			 * readahead offset is not updated
3616			 * by the synchronous read below.
3617			 */
3618			readahead_issued = 1;
3619			mutex_enter(&rp->r_statelock);
3620			/*
3621			 * set readahead offset to
3622			 * offset of last async readahead
3623			 * request.
3624			 */
3625			rp->r_nextr = rablkoff;
3626		}
3627		mutex_exit(&rp->r_statelock);
3628	}
3629
3630again:
3631	if ((pagefound = page_exists(vp, off)) == NULL) {
3632		if (pl == NULL) {
3633			(void) nfs_async_readahead(vp, blkoff, addr, seg, cr,
3634			    nfs_readahead);
3635		} else if (rw == S_CREATE) {
3636			/*
3637			 * Block for this page is not allocated, or the offset
3638			 * is beyond the current allocation size, or we're
3639			 * allocating a swap slot and the page was not found,
3640			 * so allocate it and return a zero page.
3641			 */
3642			if ((pp = page_create_va(vp, off,
3643			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
3644				cmn_err(CE_PANIC, "nfs_getapage: page_create");
3645			io_len = PAGESIZE;
3646			mutex_enter(&rp->r_statelock);
3647			rp->r_nextr = off + PAGESIZE;
3648			mutex_exit(&rp->r_statelock);
3649		} else {
3650			/*
3651			 * Need to go to the server to get a BLOCK; the
3652			 * exceptions are reading at offset 0 or doing
3653			 * random i/o, in which case read only a PAGE.
3654			 */
3655			mutex_enter(&rp->r_statelock);
3656			if (blkoff < rp->r_size &&
3657			    blkoff + bsize >= rp->r_size) {
3658				/*
3659				 * If only a block or less is left in
3660				 * the file, read all that is remaining.
3661				 */
3662				if (rp->r_size <= off) {
3663					/*
3664					 * Trying to access beyond EOF,
3665					 * set up to get at least one page.
3666					 */
3667					blksize = off + PAGESIZE - blkoff;
3668				} else
3669					blksize = rp->r_size - blkoff;
3670			} else if ((off == 0) ||
3671			    (off != rp->r_nextr && !readahead_issued)) {
3672				blksize = PAGESIZE;
3673				blkoff = off; /* block = page here */
3674			} else
3675				blksize = bsize;
3676			mutex_exit(&rp->r_statelock);
3677
3678			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
3679			    &io_len, blkoff, blksize, 0);
3680
3681			/*
3682			 * Some other thread has entered the page,
3683			 * so just use it.
3684			 */
3685			if (pp == NULL)
3686				goto again;
3687
3688			/*
3689			 * Now round the request size up to page boundaries.
3690			 * This ensures that the entire page will be
3691			 * initialized to zeroes if EOF is encountered.
3692			 */
3693			io_len = ptob(btopr(io_len));
3694
3695			bp = pageio_setup(pp, io_len, vp, B_READ);
3696			ASSERT(bp != NULL);
3697
3698			/*
3699			 * pageio_setup should have set b_addr to 0.  This
3700			 * is correct since we want to do I/O on a page
3701			 * boundary.  bp_mapin will use this addr to calculate
3702			 * an offset, and then set b_addr to the kernel virtual
3703			 * address it allocated for us.
3704			 */
3705			ASSERT(bp->b_un.b_addr == 0);
3706
3707			bp->b_edev = 0;
3708			bp->b_dev = 0;
3709			bp->b_lblkno = lbtodb(io_off);
3710			bp->b_file = vp;
3711			bp->b_offset = (offset_t)off;
3712			bp_mapin(bp);
3713
3714			/*
3715			 * If doing a write beyond what we believe is EOF,
3716			 * don't bother trying to read the pages from the
3717			 * server, we'll just zero the pages here.  We
3718			 * don't check that the rw flag is S_WRITE here
3719			 * because some implementations may attempt a
3720			 * read access to the buffer before copying data.
3721			 */
3722			mutex_enter(&rp->r_statelock);
3723			if (io_off >= rp->r_size && seg == segkmap) {
3724				mutex_exit(&rp->r_statelock);
3725				bzero(bp->b_un.b_addr, io_len);
3726			} else {
3727				mutex_exit(&rp->r_statelock);
3728				error = nfs_bio(bp, cr);
3729			}
3730
3731			/*
3732			 * Unmap the buffer before freeing it.
3733			 */
3734			bp_mapout(bp);
3735			pageio_done(bp);
3736
3737			if (error == NFS_EOF) {
3738				/*
3739				 * If doing a write system call just return
3740				 * zeroed pages, else user tried to get pages
3741				 * beyond EOF, return error.  We don't check
3742				 * that the rw flag is S_WRITE here because
3743				 * some implementations may attempt a read
3744				 * access to the buffer before copying data.
3745				 */
3746				if (seg == segkmap)
3747					error = 0;
3748				else
3749					error = EFAULT;
3750			}
3751
3752			if (!readahead_issued && !error) {
3753				mutex_enter(&rp->r_statelock);
3754				rp->r_nextr = io_off + io_len;
3755				mutex_exit(&rp->r_statelock);
3756			}
3757		}
3758	}
3759
3760out:
3761	if (pl == NULL)
3762		return (error);
3763
3764	if (error) {
3765		if (pp != NULL)
3766			pvn_read_done(pp, B_ERROR);
3767		return (error);
3768	}
3769
3770	if (pagefound) {
3771		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
3772
3773		/*
3774		 * Page exists in the cache, acquire the appropriate lock.
3775		 * If this fails, start all over again.
3776		 */
3777		if ((pp = page_lookup(vp, off, se)) == NULL) {
3778#ifdef DEBUG
3779			nfs_lostpage++;
3780#endif
3781			goto reread;
3782		}
3783		pl[0] = pp;
3784		pl[1] = NULL;
3785		return (0);
3786	}
3787
3788	if (pp != NULL)
3789		pvn_plist_init(pp, pl, plsz, off, io_len, rw);
3790
3791	return (error);
3792}
3793
3794static void
3795nfs_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
3796	cred_t *cr)
3797{
3798	int error;
3799	page_t *pp;
3800	u_offset_t io_off;
3801	size_t io_len;
3802	struct buf *bp;
3803	uint_t bsize, blksize;
3804	rnode_t *rp = VTOR(vp);
3805
3806	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
3807
3808	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
3809
3810	mutex_enter(&rp->r_statelock);
3811	if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
3812		/*
3813		 * If less than a block is left in the file, read
3814		 * only what remains.
3815		 */
3816		blksize = rp->r_size - blkoff;
3817	} else
3818		blksize = bsize;
3819	mutex_exit(&rp->r_statelock);
3820
3821	pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
3822	    &io_off, &io_len, blkoff, blksize, 1);
3823	/*
3824	 * The isra flag passed to the kluster function is 1, we may have
3825	 * gotten a return value of NULL for a variety of reasons (# of free
3826	 * pages < minfree, someone entered the page on the vnode etc). In all
3827	 * cases, we want to punt on the readahead.
3828	 */
3829	if (pp == NULL)
3830		return;
3831
3832	/*
3833	 * Now round the request size up to page boundaries.
3834	 * This ensures that the entire page will be
3835	 * initialized to zeroes if EOF is encountered.
3836	 */
3837	io_len = ptob(btopr(io_len));
3838
3839	bp = pageio_setup(pp, io_len, vp, B_READ);
3840	ASSERT(bp != NULL);
3841
3842	/*
3843	 * pageio_setup should have set b_addr to 0.  This is correct since
3844	 * we want to do I/O on a page boundary. bp_mapin() will use this addr
3845	 * to calculate an offset, and then set b_addr to the kernel virtual
3846	 * address it allocated for us.
3847	 */
3848	ASSERT(bp->b_un.b_addr == 0);
3849
3850	bp->b_edev = 0;
3851	bp->b_dev = 0;
3852	bp->b_lblkno = lbtodb(io_off);
3853	bp->b_file = vp;
3854	bp->b_offset = (offset_t)blkoff;
3855	bp_mapin(bp);
3856
3857	/*
3858	 * If doing a write beyond what we believe is EOF, don't bother trying
3859	 * to read the pages from the server, we'll just zero the pages here.
3860	 * We don't check that the rw flag is S_WRITE here because some
3861	 * implementations may attempt a read access to the buffer before
3862	 * copying data.
3863	 */
3864	mutex_enter(&rp->r_statelock);
3865	if (io_off >= rp->r_size && seg == segkmap) {
3866		mutex_exit(&rp->r_statelock);
3867		bzero(bp->b_un.b_addr, io_len);
3868		error = 0;
3869	} else {
3870		mutex_exit(&rp->r_statelock);
3871		error = nfs_bio(bp, cr);
3872		if (error == NFS_EOF)
3873			error = 0;
3874	}
3875
3876	/*
3877	 * Unmap the buffer before freeing it.
3878	 */
3879	bp_mapout(bp);
3880	pageio_done(bp);
3881
3882	pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
3883
3884	/*
3885	 * In case of error, set the readahead offset back to the
3886	 * lowest offset, since pvn_read_done() calls VN_DISPOSE
3887	 * to destroy the pages.
3888	 */
3889	if (error && rp->r_nextr > io_off) {
3890		mutex_enter(&rp->r_statelock);
3891		if (rp->r_nextr > io_off)
3892			rp->r_nextr = io_off;
3893		mutex_exit(&rp->r_statelock);
3894	}
3895}
3896
3897/*
3898 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
3899 * If len == 0, do from off to EOF.
3900 *
3901 * The normal cases should be len == 0 && off == 0 (entire vp list),
3902 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
3903 * (from pageout).
3904 */
3905static int
3906nfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr)
3907{
3908	int error;
3909	rnode_t *rp;
3910
3911	ASSERT(cr != NULL);
3912
3913	/*
3914	 * XXX - Why should this check be made here?
3915	 */
3916	if (vp->v_flag & VNOMAP)
3917		return (ENOSYS);
3918
3919	if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp))
3920		return (0);
3921
3922	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
3923		return (EIO);
3924	ASSERT(off <= MAXOFF32_T);
3925
3926	rp = VTOR(vp);
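	/*
	 * Bump r_count so that threads which must wait for all page
	 * operations to drain (e.g. the cache flush in nfs_frlock())
	 * can block on r_cv until we are done.
	 */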
3927	mutex_enter(&rp->r_statelock);
3928	rp->r_count++;
3929	mutex_exit(&rp->r_statelock);
3930	error = nfs_putpages(vp, off, len, flags, cr);
3931	mutex_enter(&rp->r_statelock);
3932	rp->r_count--;
3933	cv_broadcast(&rp->r_cv);
3934	mutex_exit(&rp->r_statelock);
3935
3936	return (error);
3937}
3938
3939/*
3940 * Write out a single page, possibly klustering adjacent dirty pages.
3941 */
3942int
3943nfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
3944	int flags, cred_t *cr)
3945{
3946	u_offset_t io_off;
3947	u_offset_t lbn_off;
3948	u_offset_t lbn;
3949	size_t io_len;
3950	uint_t bsize;
3951	int error;
3952	rnode_t *rp;
3953
3954	ASSERT(!vn_is_readonly(vp));
3955	ASSERT(pp != NULL);
3956	ASSERT(cr != NULL);
3957	ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone);
3958
3959	rp = VTOR(vp);
3960	ASSERT(rp->r_count > 0);
3961
3962	ASSERT(pp->p_offset <= MAXOFF32_T);
3963
3964	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
3965	lbn = pp->p_offset / bsize;
3966	lbn_off = lbn * bsize;
3967
3968	/*
3969	 * Find a kluster that fits in one block, or in
3970	 * one page if pages are bigger than blocks.  If
3971	 * there is less file space allocated than a whole
3972	 * page, we'll shorten the i/o request below.
3973	 */
3974	pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
3975	    roundup(bsize, PAGESIZE), flags);
3976
3977	/*
3978	 * pvn_write_kluster shouldn't have returned a page with offset
3979	 * behind the original page we were given.  Verify that.
3980	 */
3981	ASSERT((pp->p_offset / bsize) >= lbn);
3982
3983	/*
3984	 * Now pp will have the list of kept dirty pages marked for
3985	 * write back.  It will also handle invalidation and freeing
3986	 * of pages that are not dirty.  Check for page length rounding
3987	 * problems.
3988	 */
3989	if (io_off + io_len > lbn_off + bsize) {
3990		ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
3991		io_len = lbn_off + bsize - io_off;
3992	}
	/*
	 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
	 * consistent value of r_size. RMODINPROGRESS is set in writerp().
	 * When RMODINPROGRESS is set it indicates that a uiomove() is in
	 * progress and the r_size has not been made consistent with the
	 * new size of the file. When the uiomove() completes the r_size is
	 * updated and the RMODINPROGRESS flag is cleared.
	 *
	 * Without this handshaking, it is possible that nfs(3)_bio()
	 * picks up the old value of r_size before the uiomove() in
	 * writerp() completes. This will result in the write through
	 * nfs(3)_bio() being dropped.
	 *
	 * More precisely, there is a window between the time the uiomove()
	 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
	 * operation intervenes in this window, the page will be picked up,
	 * because it is dirty (it will be unlocked, unless it was
	 * pagecreate'd). When the page is picked up as dirty, the dirty
	 * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is
	 * checked. This will still be the old size. Therefore the page
	 * will not be written out. When segmap_release() calls
	 * VOP_PUTPAGE(), the page will be found to be clean and the
	 * write will be dropped.
	 */
4017	if (rp->r_flags & RMODINPROGRESS) {
4018		mutex_enter(&rp->r_statelock);
4019		if ((rp->r_flags & RMODINPROGRESS) &&
4020		    rp->r_modaddr + MAXBSIZE > io_off &&
4021		    rp->r_modaddr < io_off + io_len) {
4022			page_t *plist;
			/*
			 * A write is in progress for this region of the
			 * file.  If we did not detect RMODINPROGRESS here,
			 * then this path through nfs_putapage() would
			 * eventually go to nfs(3)_bio() and might not write
			 * out all of the data in the pages, and we would
			 * end up losing data.  So we set the modified bit
			 * on each page in the page list and mark the rnode
			 * with RDIRTY.  This write will be restarted at
			 * some later time.
			 */
4033			plist = pp;
4034			while (plist != NULL) {
4035				pp = plist;
4036				page_sub(&plist, pp);
4037				hat_setmod(pp);
4038				page_io_unlock(pp);
4039				page_unlock(pp);
4040			}
4041			rp->r_flags |= RDIRTY;
4042			mutex_exit(&rp->r_statelock);
4043			if (offp)
4044				*offp = io_off;
4045			if (lenp)
4046				*lenp = io_len;
4047			return (0);
4048		}
4049		mutex_exit(&rp->r_statelock);
4050	}
4051
4052	if (flags & B_ASYNC) {
4053		error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
4054		    nfs_sync_putapage);
4055	} else
4056		error = nfs_sync_putapage(vp, pp, io_off, io_len, flags, cr);
4057
4058	if (offp)
4059		*offp = io_off;
4060	if (lenp)
4061		*lenp = io_len;
4062	return (error);
4063}
4064
4065static int
4066nfs_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4067	int flags, cred_t *cr)
4068{
4069	int error;
4070	rnode_t *rp;
4071
4072	flags |= B_WRITE;
4073
4074	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4075	error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4076
4077	rp = VTOR(vp);
4078
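	/*
	 * ENOSPC, EDQUOT, and EACCES get special treatment: mark the
	 * rnode ROUTOFSPACE (cleared again below once a write succeeds)
	 * and, unless the pages were already being invalidated and
	 * forcibly destroyed, retry the write with B_INVAL | B_FORCE so
	 * that unwritable pages cannot accumulate in memory.
	 */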
4079	if ((error == ENOSPC || error == EDQUOT || error == EACCES) &&
4080	    (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
4081		if (!(rp->r_flags & ROUTOFSPACE)) {
4082			mutex_enter(&rp->r_statelock);
4083			rp->r_flags |= ROUTOFSPACE;
4084			mutex_exit(&rp->r_statelock);
4085		}
4086		flags |= B_ERROR;
4087		pvn_write_done(pp, flags);
4088		/*
4089		 * If this was not an async thread, then try again to
4090		 * write out the pages, but this time, also destroy
4091		 * them whether or not the write is successful.  This
4092		 * will prevent memory from filling up with these
4093		 * pages and destroying them is the only alternative
4094		 * if they can't be written out.
4095		 *
4096		 * Don't do this if this is an async thread because
4097		 * when the pages are unlocked in pvn_write_done,
4098		 * some other thread could have come along, locked
4099		 * them, and queued for an async thread.  It would be
4100		 * possible for all of the async threads to be tied
4101		 * up waiting to lock the pages again and they would
4102		 * all already be locked and waiting for an async
4103		 * thread to handle them.  Deadlock.
4104		 */
4105		if (!(flags & B_ASYNC)) {
4106			error = nfs_putpage(vp, io_off, io_len,
4107			    B_INVAL | B_FORCE, cr);
4108		}
4109	} else {
4110		if (error)
4111			flags |= B_ERROR;
4112		else if (rp->r_flags & ROUTOFSPACE) {
4113			mutex_enter(&rp->r_statelock);
4114			rp->r_flags &= ~ROUTOFSPACE;
4115			mutex_exit(&rp->r_statelock);
4116		}
4117		pvn_write_done(pp, flags);
4118	}
4119
4120	return (error);
4121}
4122
4123static int
4124nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4125	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr)
4126{
4127	struct segvn_crargs vn_a;
4128	int error;
4129	rnode_t *rp;
4130	struct vattr va;
4131
4132	if (nfs_zone() != VTOMI(vp)->mi_zone)
4133		return (EIO);
4134
4135	if (vp->v_flag & VNOMAP)
4136		return (ENOSYS);
4137
4138	if (off > MAXOFF32_T)
4139		return (EFBIG);
4140
4141	if (off < 0 || off + len < 0)
4142		return (ENXIO);
4143
4144	if (vp->v_type != VREG)
4145		return (ENODEV);
4146
	/*
	 * If there is cached data and if close-to-open consistency
	 * checking is not turned off and if the file system is not
	 * mounted readonly, then force an over the wire getattr.
	 * Otherwise, just invoke nfsgetattr to get a copy of the
	 * attributes.  The attribute cache will be used unless it
	 * has timed out, in which case an over the wire getattr
	 * will be issued.
	 */
4156	va.va_mask = AT_ALL;
4157	if (vn_has_cached_data(vp) &&
4158	    !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
4159		error = nfs_getattr_otw(vp, &va, cr);
4160	else
4161		error = nfsgetattr(vp, &va, cr);
4162	if (error)
4163		return (error);
4164
4165	/*
4166	 * Check to see if the vnode is currently marked as not cachable.
4167	 * This means portions of the file are locked (through VOP_FRLOCK).
4168	 * In this case the map request must be refused.  We use
4169	 * rp->r_lkserlock to avoid a race with concurrent lock requests.
4170	 */
4171	rp = VTOR(vp);
4172	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
4173		return (EINTR);
4174
4175	if (vp->v_flag & VNOCACHE) {
4176		error = EAGAIN;
4177		goto done;
4178	}
4179
4180	/*
4181	 * Don't allow concurrent locks and mapping if mandatory locking is
4182	 * enabled.
4183	 */
4184	if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
4185	    MANDLOCK(vp, va.va_mode)) {
4186		error = EAGAIN;
4187		goto done;
4188	}
4189
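	/*
	 * as_rangelock() serializes address selection with the as_map()
	 * below, so the range chosen by map_addr() cannot be claimed by
	 * a competing mapping before the segment is created.
	 */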
4190	as_rangelock(as);
4191	if (!(flags & MAP_FIXED)) {
4192		map_addr(addrp, len, off, 1, flags);
4193		if (*addrp == NULL) {
4194			as_rangeunlock(as);
4195			error = ENOMEM;
4196			goto done;
4197		}
4198	} else {
4199		/*
4200		 * User specified address - blow away any previous mappings
4201		 */
4202		(void) as_unmap(as, *addrp, len);
4203	}
4204
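	/*
	 * Describe the new segment to segvn: it is backed directly by
	 * the vnode at the given offset (amp == NULL, so no anonymous
	 * map), with the protections requested by the caller.
	 */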
4205	vn_a.vp = vp;
4206	vn_a.offset = off;
4207	vn_a.type = (flags & MAP_TYPE);
4208	vn_a.prot = (uchar_t)prot;
4209	vn_a.maxprot = (uchar_t)maxprot;
4210	vn_a.flags = (flags & ~MAP_TYPE);
4211	vn_a.cred = cr;
4212	vn_a.amp = NULL;
4213	vn_a.szc = 0;
4214	vn_a.lgrp_mem_policy_flags = 0;
4215
4216	error = as_map(as, *addrp, len, segvn_create, &vn_a);
4217	as_rangeunlock(as);
4218
4219done:
4220	nfs_rw_exit(&rp->r_lkserlock);
4221	return (error);
4222}
4223
4224/* ARGSUSED */
4225static int
4226nfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4227	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr)
4228{
4229	rnode_t *rp;
4230
4231	if (vp->v_flag & VNOMAP)
4232		return (ENOSYS);
4233	if (nfs_zone() != VTOMI(vp)->mi_zone)
4234		return (EIO);
4235
4236	/*
4237	 * Need to hold rwlock while incrementing the mapcnt so that
4238	 * mmap'ing can be serialized with writes so that the caching
4239	 * can be handled correctly.
4240	 */
4241	rp = VTOR(vp);
4242	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
4243		return (EINTR);
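	/*
	 * r_mapcnt counts the pages mapped in; nfs_delmap_callback()
	 * drops it by the same btopr(len) when the mapping is torn down.
	 */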
4244	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
4245	nfs_rw_exit(&rp->r_rwlock);
4246
4247	return (0);
4248}
4249
4250static int
4251nfs_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
4252	offset_t offset, struct flk_callback *flk_cbp, cred_t *cr)
4253{
4254	netobj lm_fh;
4255	int rc;
4256	u_offset_t start, end;
4257	rnode_t *rp;
4258	int error = 0, intr = INTR(vp);
4259
4260	/* check for valid cmd parameter */
4261	if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
4262		return (EINVAL);
4263	if (nfs_zone() != VTOMI(vp)->mi_zone)
4264		return (EIO);
4265
4266	/* Verify l_type. */
4267	switch (bfp->l_type) {
4268	case F_RDLCK:
4269		if (cmd != F_GETLK && !(flag & FREAD))
4270			return (EBADF);
4271		break;
4272	case F_WRLCK:
4273		if (cmd != F_GETLK && !(flag & FWRITE))
4274			return (EBADF);
4275		break;
4276	case F_UNLCK:
4277		intr = 0;
4278		break;
4279
4280	default:
4281		return (EINVAL);
4282	}
4283
4284	/* check the validity of the lock range */
4285	if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
4286		return (rc);
4287	if (rc = flk_check_lock_data(start, end, MAXOFF32_T))
4288		return (rc);
4289
4290	/*
4291	 * If the filesystem is mounted using local locking, pass the
4292	 * request off to the local locking code.
4293	 */
4294	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
4295		if (offset > MAXOFF32_T)
4296			return (EFBIG);
4297		if (cmd == F_SETLK || cmd == F_SETLKW) {
4298			/*
4299			 * For complete safety, we should be holding
4300			 * r_lkserlock.  However, we can't call
4301			 * lm_safelock and then fs_frlock while
4302			 * holding r_lkserlock, so just invoke
4303			 * lm_safelock and expect that this will
4304			 * catch enough of the cases.
4305			 */
4306			if (!lm_safelock(vp, bfp, cr))
4307				return (EAGAIN);
4308		}
4309		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr));
4310	}
4311
4312	rp = VTOR(vp);
4313
4314	/*
4315	 * Check whether the given lock request can proceed, given the
4316	 * current file mappings.
4317	 */
4318	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
4319		return (EINTR);
4320	if (cmd == F_SETLK || cmd == F_SETLKW) {
4321		if (!lm_safelock(vp, bfp, cr)) {
4322			rc = EAGAIN;
4323			goto done;
4324		}
4325	}
4326
4327	/*
4328	 * Flush the cache after waiting for async I/O to finish.  For new
4329	 * locks, this is so that the process gets the latest bits from the
4330	 * server.  For unlocks, this is so that other clients see the
4331	 * latest bits once the file has been unlocked.  If currently dirty
4332	 * pages can't be flushed, then don't allow a lock to be set.  But
4333	 * allow unlocks to succeed, to avoid having orphan locks on the
4334	 * server.
4335	 */
4336	if (cmd != F_GETLK) {
4337		mutex_enter(&rp->r_statelock);
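		/*
		 * Wait for any outstanding page operations (r_count) to
		 * drain.  lwp_nostop is bumped around cv_wait_sig(),
		 * apparently so the thread cannot be stopped (e.g. via
		 * /proc) while it sleeps here.
		 */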
		while (rp->r_count > 0) {
			if (intr) {
				klwp_t *lwp = ttolwp(curthread);

				if (lwp != NULL)
					lwp->lwp_nostop++;
				if (cv_wait_sig(&rp->r_cv,
				    &rp->r_statelock) == 0) {
					if (lwp != NULL)
						lwp->lwp_nostop--;
					rc = EINTR;
					break;
				}
				if (lwp != NULL)
					lwp->lwp_nostop--;
			} else
				cv_wait(&rp->r_cv, &rp->r_statelock);
		}
4355		mutex_exit(&rp->r_statelock);
4356		if (rc != 0)
4357			goto done;
4358		error = nfs_putpage(vp, (offset_t)0, 0, B_INVAL, cr);
4359		if (error) {
4360			if (error == ENOSPC || error == EDQUOT) {
4361				mutex_enter(&rp->r_statelock);
4362				if (!rp->r_error)
4363					rp->r_error = error;
4364				mutex_exit(&rp->r_statelock);
4365			}
4366			if (bfp->l_type != F_UNLCK) {
4367				rc = ENOLCK;
4368				goto done;
4369			}
4370		}
4371	}
4372
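	/*
	 * Package the file handle as an opaque netobj for the lock
	 * manager; an NFS Version 2 file handle is a fixed-size
	 * fhandle_t.
	 */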
4373	lm_fh.n_len = sizeof (fhandle_t);
4374	lm_fh.n_bytes = (char *)VTOFH(vp);
4375
4376	/*
4377	 * Call the lock manager to do the real work of contacting
4378	 * the server and obtaining the lock.
4379	 */
4380	rc = lm_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh, flk_cbp);
4381
4382	if (rc == 0)
4383		nfs_lockcompletion(vp, cmd);
4384
4385done:
4386	nfs_rw_exit(&rp->r_lkserlock);
4387	return (rc);
4388}
4389
4390/*
4391 * Free storage space associated with the specified vnode.  The portion
4392 * to be freed is specified by bfp->l_start and bfp->l_len (already
4393 * normalized to a "whence" of 0).
4394 *
4395 * This is an experimental facility whose continued existence is not
4396 * guaranteed.  Currently, we only support the special case
4397 * of l_len == 0, meaning free to end of file.
4398 */
4399/* ARGSUSED */
4400static int
4401nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
4402	offset_t offset, cred_t *cr, caller_context_t *ct)
4403{
4404	int error;
4405
4406	ASSERT(vp->v_type == VREG);
4407	if (cmd != F_FREESP)
4408		return (EINVAL);
4409
4410	if (offset > MAXOFF32_T)
4411		return (EFBIG);
4412
4413	if ((bfp->l_start > MAXOFF32_T) || (bfp->l_end > MAXOFF32_T) ||
4414	    (bfp->l_len > MAXOFF32_T))
4415		return (EFBIG);
4416
4417	if (nfs_zone() != VTOMI(vp)->mi_zone)
4418		return (EIO);
4419
4420	error = convoff(vp, bfp, 0, offset);
4421	if (!error) {
4422		ASSERT(bfp->l_start >= 0);
4423		if (bfp->l_len == 0) {
4424			struct vattr va;
4425
4426			/*
4427			 * ftruncate should not change the ctime and
4428			 * mtime if we truncate the file to its
4429			 * previous size.
4430			 */
4431			va.va_mask = AT_SIZE;
4432			error = nfsgetattr(vp, &va, cr);
4433			if (error || va.va_size == bfp->l_start)
4434				return (error);
4435			va.va_mask = AT_SIZE;
4436			va.va_size = bfp->l_start;
4437			error = nfssetattr(vp, &va, 0, cr);
4438		} else
4439			error = EINVAL;
4440	}
4441
4442	return (error);
4443}
4444
4445/* ARGSUSED */
4446static int
4447nfs_realvp(vnode_t *vp, vnode_t **vpp)
4448{
4449
4450	return (EINVAL);
4451}
4452
4453/*
4454 * Setup and add an address space callback to do the work of the delmap call.
4455 * The callback will (and must be) deleted in the actual callback function.
4456 *
4457 * This is done in order to take care of the problem that we have with holding
4458 * the address space's a_lock for a long period of time (e.g. if the NFS server
4459 * is down).  Callbacks will be executed in the address space code while the
4460 * a_lock is not held.	Holding the address space's a_lock causes things such
4461 * as ps and fork to hang because they are trying to acquire this lock as well.
4462 */
4463/* ARGSUSED */
4464static int
4465nfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4466	size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr)
4467{
4468	int			caller_found;
4469	int			error;
4470	rnode_t			*rp;
4471	nfs_delmap_args_t	*dmapp;
4472	nfs_delmapcall_t	*delmap_call;
4473
4474	if (vp->v_flag & VNOMAP)
4475		return (ENOSYS);
4476	/*
4477	 * A process may not change zones if it has NFS pages mmap'ed
4478	 * in, so we can't legitimately get here from the wrong zone.
4479	 */
4480	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4481
4482	rp = VTOR(vp);
4483
4484	/*
4485	 * The way that the address space of this process deletes its mapping
4486	 * of this file is via the following call chains:
4487	 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
4488	 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
4489	 *
4490	 * With the use of address space callbacks we are allowed to drop the
4491	 * address space lock, a_lock, while executing the NFS operations that
4492	 * need to go over the wire.  Returning EAGAIN to the caller of this
4493	 * function is what drives the execution of the callback that we add
4494	 * below.  The callback will be executed by the address space code
4495	 * after dropping the a_lock.  When the callback is finished, since
4496	 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
4497	 * is called again on the same segment to finish the rest of the work
4498	 * that needs to happen during unmapping.
4499	 *
4500	 * This action of calling back into the segment driver causes
4501	 * nfs_delmap() to get called again, but since the callback was
4502	 * already executed at this point, it already did the work and there
4503	 * is nothing left for us to do.
4504	 *
	 * To summarize:
	 * - The first time nfs_delmap is called by the current thread, we
	 * add the caller associated with this delmap to the delmap caller
	 * list, add the callback, and return EAGAIN.
	 * - The second time nfs_delmap is called in this call chain, we
	 * find this caller in the delmap caller list, realize there is no
	 * more work to do, remove the caller from the list, and return
	 * the error that was set in the callback execution.
	 */
4514	caller_found = nfs_find_and_delete_delmapcall(rp, &error);
4515	if (caller_found) {
4516		/*
4517		 * 'error' is from the actual delmap operations.  To avoid
4518		 * hangs, we need to handle the return of EAGAIN differently
4519		 * since this is what drives the callback execution.
4520		 * In this case, we don't want to return EAGAIN and do the
4521		 * callback execution because there are none to execute.
4522		 */
4523		if (error == EAGAIN)
4524			return (0);
4525		else
4526			return (error);
4527	}
4528
4529	/* current caller was not in the list */
4530	delmap_call = nfs_init_delmapcall();
4531
4532	mutex_enter(&rp->r_statelock);
4533	list_insert_tail(&rp->r_indelmap, delmap_call);
4534	mutex_exit(&rp->r_statelock);
4535
4536	dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);
4537
4538	dmapp->vp = vp;
4539	dmapp->off = off;
4540	dmapp->addr = addr;
4541	dmapp->len = len;
4542	dmapp->prot = prot;
4543	dmapp->maxprot = maxprot;
4544	dmapp->flags = flags;
4545	dmapp->cr = cr;
4546	dmapp->caller = delmap_call;
4547
4548	error = as_add_callback(as, nfs_delmap_callback, dmapp,
4549	    AS_UNMAP_EVENT, addr, len, KM_SLEEP);
4550
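	/*
	 * Returning EAGAIN (when as_add_callback() succeeds) is what
	 * drives the address space code to drop a_lock and execute the
	 * callback registered above; see the block comment earlier in
	 * this function.
	 */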
4551	return (error ? error : EAGAIN);
4552}
4553
4554/*
4555 * Remove some pages from an mmap'd vnode.  Just update the
4556 * count of pages.  If doing close-to-open, then flush all
4557 * of the pages associated with this file.  Otherwise, start
4558 * an asynchronous page flush to write out any dirty pages.
4559 * This will also associate a credential with the rnode which
4560 * can be used to write the pages.
4561 */
4562/* ARGSUSED */
4563static void
4564nfs_delmap_callback(struct as *as, void *arg, uint_t event)
4565{
4566	int			error;
4567	rnode_t			*rp;
4568	mntinfo_t		*mi;
4569	nfs_delmap_args_t	*dmapp = (nfs_delmap_args_t *)arg;
4570
4571	rp = VTOR(dmapp->vp);
4572	mi = VTOMI(dmapp->vp);
4573
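	/*
	 * Undo the page count that nfs_addmap() added for this range.
	 */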
4574	atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
4575	ASSERT(rp->r_mapcnt >= 0);
4576
4577	/*
4578	 * Initiate a page flush if there are pages, the file system
4579	 * was not mounted readonly, the segment was mapped shared, and
4580	 * the pages themselves were writeable.
4581	 */
4582	if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
4583	    dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
4584		mutex_enter(&rp->r_statelock);
4585		rp->r_flags |= RDIRTY;
4586		mutex_exit(&rp->r_statelock);
		/*
		 * If close-to-open consistency is disabled, or if this is
		 * a cross-zone access (a sync putpage won't work across
		 * zones), the best we can do is try an async putpage.
		 * That seems better than something more draconian such as
		 * discarding the dirty pages.
		 */
4593		if ((mi->mi_flags & MI_NOCTO) ||
4594		    nfs_zone() != mi->mi_zone)
4595			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4596			    B_ASYNC, dmapp->cr);
4597		else
4598			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4599			    0, dmapp->cr);
4600		if (!error) {
4601			mutex_enter(&rp->r_statelock);
4602			error = rp->r_error;
4603			rp->r_error = 0;
4604			mutex_exit(&rp->r_statelock);
4605		}
4606	} else
4607		error = 0;
4608
4609	if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
4610		(void) nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4611		    B_INVAL, dmapp->cr);
4612
4613	dmapp->caller->error = error;
4614	(void) as_delete_callback(as, arg);
4615	kmem_free(dmapp, sizeof (nfs_delmap_args_t));
4616}
4617
4618/* ARGSUSED */
4619static int
4620nfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
4621{
4622	int error = 0;
4623
4624	if (nfs_zone() != VTOMI(vp)->mi_zone)
4625		return (EIO);
4626	/*
4627	 * This looks a little weird because it's written in a general
4628	 * manner but we make little use of cases.  If cntl() ever gets
4629	 * widely used, the outer switch will make more sense.
4630	 */
4631
4632	switch (cmd) {
4633
4634	/*
	 * Large file spec - answer this new query with a hardcoded
	 * constant based on the protocol (NFS Version 2 file offsets
	 * are 32 bits).
4637	 */
4638	case _PC_FILESIZEBITS:
4639		*valp = 32;
4640		return (0);
4641
4642	case _PC_LINK_MAX:
4643	case _PC_NAME_MAX:
4644	case _PC_PATH_MAX:
4645	case _PC_SYMLINK_MAX:
4646	case _PC_CHOWN_RESTRICTED:
4647	case _PC_NO_TRUNC: {
4648		mntinfo_t *mi;
4649		struct pathcnf *pc;
4650
4651		if ((mi = VTOMI(vp)) == NULL || (pc = mi->mi_pathconf) == NULL)
4652			return (EINVAL);
4653		error = _PC_ISSET(cmd, pc->pc_mask);    /* error or bool */
4654		switch (cmd) {
4655		case _PC_LINK_MAX:
4656			*valp = pc->pc_link_max;
4657			break;
4658		case _PC_NAME_MAX:
4659			*valp = pc->pc_name_max;
4660			break;
4661		case _PC_PATH_MAX:
4662		case _PC_SYMLINK_MAX:
4663			*valp = pc->pc_path_max;
4664			break;
		case _PC_CHOWN_RESTRICTED:
		case _PC_NO_TRUNC:
			/*
			 * If we got here, error is really a boolean which
			 * indicates whether cmd is set or not.
			 */
			*valp = error ? 1 : 0;	/* see above */
			error = 0;
			break;
4681		}
4682		return (error ? EINVAL : 0);
4683	    }
4684
4685	case _PC_XATTR_EXISTS:
4686		*valp = 0;
4687		if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
4688			vnode_t *avp;
4689			rnode_t *rp;
4690			mntinfo_t *mi = VTOMI(vp);
4691
4692			if (!(mi->mi_flags & MI_EXTATTR))
4693				return (0);
4694
4695			rp = VTOR(vp);
4696			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
4697			    INTR(vp)))
4698				return (EINTR);
4699
4700			error = nfslookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
4701			if (error || avp == NULL)
4702				error = acl_getxattrdir2(vp, &avp, 0, cr, 0);
4703
4704			nfs_rw_exit(&rp->r_rwlock);
4705
4706			if (error == 0 && avp != NULL) {
4707				VN_RELE(avp);
4708				*valp = 1;
4709			}
4710		}
4711		return (error ? EINVAL : 0);
4712
4713	case _PC_ACL_ENABLED:
4714		*valp = _ACL_ACLENT_ENABLED;
4715		return (0);
4716
4717	default:
4718		return (EINVAL);
4719	}
4720}
4721
4722/*
4723 * Called by async thread to do synchronous pageio. Do the i/o, wait
4724 * for it to complete, and cleanup the page list when done.
4725 */
4726static int
4727nfs_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4728	int flags, cred_t *cr)
4729{
4730	int error;
4731
4732	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4733	error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4734	if (flags & B_READ)
4735		pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
4736	else
4737		pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
4738	return (error);
4739}
4740
4741static int
4742nfs_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4743	int flags, cred_t *cr)
4744{
4745	int error;
4746	rnode_t *rp;
4747
4748	if (pp == NULL)
4749		return (EINVAL);
4750
4751	if (io_off > MAXOFF32_T)
4752		return (EFBIG);
4753	if (nfs_zone() != VTOMI(vp)->mi_zone)
4754		return (EIO);
4755	rp = VTOR(vp);
4756	mutex_enter(&rp->r_statelock);
4757	rp->r_count++;
4758	mutex_exit(&rp->r_statelock);
4759
4760	if (flags & B_ASYNC) {
4761		error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
4762		    nfs_sync_pageio);
4763	} else
4764		error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4765	mutex_enter(&rp->r_statelock);
4766	rp->r_count--;
4767	cv_broadcast(&rp->r_cv);
4768	mutex_exit(&rp->r_statelock);
4769	return (error);
4770}
4771
4772static int
4773nfs_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr)
4774{
4775	int error;
4776	mntinfo_t *mi;
4777
4778	mi = VTOMI(vp);
4779
4780	if (nfs_zone() != mi->mi_zone)
4781		return (EIO);
4782	if (mi->mi_flags & MI_ACL) {
4783		error = acl_setacl2(vp, vsecattr, flag, cr);
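		/*
		 * Re-check MI_ACL rather than returning unconditionally;
		 * the assumption here is that acl_setacl2() can clear
		 * MI_ACL when the server turns out not to support the
		 * ACL protocol, in which case we fall through to ENOSYS.
		 */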
4784		if (mi->mi_flags & MI_ACL)
4785			return (error);
4786	}
4787
4788	return (ENOSYS);
4789}
4790
4791static int
4792nfs_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr)
4793{
4794	int error;
4795	mntinfo_t *mi;
4796
4797	mi = VTOMI(vp);
4798
4799	if (nfs_zone() != mi->mi_zone)
4800		return (EIO);
4801	if (mi->mi_flags & MI_ACL) {
4802		error = acl_getacl2(vp, vsecattr, flag, cr);
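		/*
		 * As in nfs_setsecattr(), re-check MI_ACL in case the
		 * server turned out not to support the ACL protocol; if
		 * it was cleared, fall through and fabricate an ACL from
		 * the regular file attributes via fs_fab_acl().
		 */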
4803		if (mi->mi_flags & MI_ACL)
4804			return (error);
4805	}
4806
4807	return (fs_fab_acl(vp, vsecattr, flag, cr));
4808}
4809
4810static int
4811nfs_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr)
4812{
4813	int error;
4814	struct shrlock nshr;
4815	struct nfs_owner nfs_owner;
4816	netobj lm_fh;
4817
4818	if (nfs_zone() != VTOMI(vp)->mi_zone)
4819		return (EIO);
4820
4821	/*
4822	 * check for valid cmd parameter
4823	 */
4824	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
4825		return (EINVAL);
4826
4827	/*
4828	 * Check access permissions
4829	 */
4830	if (cmd == F_SHARE &&
4831	    (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
4832	    ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
4833		return (EBADF);
4834
4835	/*
4836	 * If the filesystem is mounted using local locking, pass the
4837	 * request off to the local share code.
4838	 */
4839	if (VTOMI(vp)->mi_flags & MI_LLOCK)
4840		return (fs_shrlock(vp, cmd, shr, flag, cr));
4841
4842	switch (cmd) {
4843	case F_SHARE:
4844	case F_UNSHARE:
4845		lm_fh.n_len = sizeof (fhandle_t);
4846		lm_fh.n_bytes = (char *)VTOFH(vp);
4847
		/*
		 * If we are passed an owner that is too large to fit in
		 * an nfs_owner, it is likely a recursive call from the
		 * lock manager client, so pass it straight through.  If
		 * it is not an nfs_owner then simply return an error.
		 */
4854		if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
4855			if (((struct nfs_owner *)shr->s_owner)->magic !=
4856			    NFS_OWNER_MAGIC)
4857				return (EINVAL);
4858
4859			if (error = lm_shrlock(vp, cmd, shr, flag, &lm_fh)) {
4860				error = set_errno(error);
4861			}
4862			return (error);
4863		}
		/*
		 * A remote share reservation's owner is a combination of
		 * a magic number, the hostname, and the local owner.
		 */
4868		bzero(&nfs_owner, sizeof (nfs_owner));
4869		nfs_owner.magic = NFS_OWNER_MAGIC;
4870		(void) strncpy(nfs_owner.hname, uts_nodename(),
4871		    sizeof (nfs_owner.hname));
4872		bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
4873		nshr.s_access = shr->s_access;
4874		nshr.s_deny = shr->s_deny;
4875		nshr.s_sysid = 0;
4876		nshr.s_pid = ttoproc(curthread)->p_pid;
4877		nshr.s_own_len = sizeof (nfs_owner);
4878		nshr.s_owner = (caddr_t)&nfs_owner;
4879
4880		if (error = lm_shrlock(vp, cmd, &nshr, flag, &lm_fh)) {
4881			error = set_errno(error);
4882		}
4883
4884		break;
4885
4886	case F_HASREMOTELOCKS:
4887		/*
4888		 * NFS client can't store remote locks itself
4889		 */
4890		shr->s_access = 0;
4891		error = 0;
4892		break;
4893
4894	default:
4895		error = EINVAL;
4896		break;
4897	}
4898
4899	return (error);
4900}
4901