1/*-
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Rick Macklem at The University of Guelph.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 */
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD$");
36
37#include <sys/capability.h>
38
39/*
40 * Functions that perform the vfs operations required by the routines in
41 * nfsd_serv.c. It is hoped that this change will make the server more
42 * portable.
43 */
44
45#include <fs/nfs/nfsport.h>
46#include <sys/hash.h>
47#include <sys/sysctl.h>
48#include <nlm/nlm_prot.h>
49#include <nlm/nlm.h>
50
51FEATURE(nfsd, "NFSv4 server");
52
53extern u_int32_t newnfs_true, newnfs_false, newnfs_xdrneg1;
54extern int nfsrv_useacl;
55extern int newnfs_numnfsd;
56extern struct mount nfsv4root_mnt;
57extern struct nfsrv_stablefirst nfsrv_stablefirst;
58extern void (*nfsd_call_servertimer)(void);
59extern SVCPOOL	*nfsrvd_pool;
60extern struct nfsv4lock nfsd_suspend_lock;
61struct vfsoptlist nfsv4root_opt, nfsv4root_newopt;
62NFSDLOCKMUTEX;
63struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
64struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
65struct mtx nfsrc_udpmtx;
66struct mtx nfs_v4root_mutex;
67struct nfsrvfh nfs_rootfh, nfs_pubfh;
68int nfs_pubfhset = 0, nfs_rootfhset = 0;
69struct proc *nfsd_master_proc = NULL;
70static pid_t nfsd_master_pid = (pid_t)-1;
71static char nfsd_master_comm[MAXCOMLEN + 1];
72static struct timeval nfsd_master_start;
73static uint32_t nfsv4_sysid = 0;
74
75static int nfssvc_srvcall(struct thread *, struct nfssvc_args *,
76    struct ucred *);
77
78int nfsrv_enable_crossmntpt = 1;
79static int nfs_commit_blks;
80static int nfs_commit_miss;
81extern int nfsrv_issuedelegs;
82extern int nfsrv_dolocallocks;
83extern int nfsd_enable_stringtouid;
84
85SYSCTL_NODE(_vfs, OID_AUTO, nfsd, CTLFLAG_RW, 0, "New NFS server");
86SYSCTL_INT(_vfs_nfsd, OID_AUTO, mirrormnt, CTLFLAG_RW,
87    &nfsrv_enable_crossmntpt, 0, "Enable nfsd to cross mount points");
88SYSCTL_INT(_vfs_nfsd, OID_AUTO, commit_blks, CTLFLAG_RW, &nfs_commit_blks,
89    0, "");
90SYSCTL_INT(_vfs_nfsd, OID_AUTO, commit_miss, CTLFLAG_RW, &nfs_commit_miss,
91    0, "");
92SYSCTL_INT(_vfs_nfsd, OID_AUTO, issue_delegations, CTLFLAG_RW,
93    &nfsrv_issuedelegs, 0, "Enable nfsd to issue delegations");
94SYSCTL_INT(_vfs_nfsd, OID_AUTO, enable_locallocks, CTLFLAG_RW,
95    &nfsrv_dolocallocks, 0, "Enable nfsd to acquire local locks on files");
96SYSCTL_INT(_vfs_nfsd, OID_AUTO, enable_stringtouid, CTLFLAG_RW,
97    &nfsd_enable_stringtouid, 0, "Enable nfsd to accept numeric owner_names");
98
99#define	MAX_REORDERED_RPC	16
100#define	NUM_HEURISTIC		1031
101#define	NHUSE_INIT		64
102#define	NHUSE_INC		16
103#define	NHUSE_MAX		2048
104
105static struct nfsheur {
106	struct vnode *nh_vp;	/* vp to match (unreferenced pointer) */
107	off_t nh_nextoff;	/* next offset for sequential detection */
108	int nh_use;		/* use count for selection */
109	int nh_seqcount;	/* heuristic */
110} nfsheur[NUM_HEURISTIC];
111
112
113/*
114 * Heuristic to detect sequential operation.
115 */
116static struct nfsheur *
117nfsrv_sequential_heuristic(struct uio *uio, struct vnode *vp)
118{
119	struct nfsheur *nh;
120	int hi, try;
121
122	/* Locate best candidate. */
123	try = 32;
124	hi = ((int)(vm_offset_t)vp / sizeof(struct vnode)) % NUM_HEURISTIC;
125	nh = &nfsheur[hi];
126	while (try--) {
127		if (nfsheur[hi].nh_vp == vp) {
128			nh = &nfsheur[hi];
129			break;
130		}
131		if (nfsheur[hi].nh_use > 0)
132			--nfsheur[hi].nh_use;
133		hi = (hi + 1) % NUM_HEURISTIC;
134		if (nfsheur[hi].nh_use < nh->nh_use)
135			nh = &nfsheur[hi];
136	}
137
138	/* Initialize hint if this is a new file. */
139	if (nh->nh_vp != vp) {
140		nh->nh_vp = vp;
141		nh->nh_nextoff = uio->uio_offset;
142		nh->nh_use = NHUSE_INIT;
143		if (uio->uio_offset == 0)
144			nh->nh_seqcount = 4;
145		else
146			nh->nh_seqcount = 1;
147	}
148
149	/* Calculate heuristic. */
150	if ((uio->uio_offset == 0 && nh->nh_seqcount > 0) ||
151	    uio->uio_offset == nh->nh_nextoff) {
152		/* See comments in vfs_vnops.c:sequential_heuristic(). */
153		nh->nh_seqcount += howmany(uio->uio_resid, 16384);
154		if (nh->nh_seqcount > IO_SEQMAX)
155			nh->nh_seqcount = IO_SEQMAX;
156	} else if (qabs(uio->uio_offset - nh->nh_nextoff) <= MAX_REORDERED_RPC *
157	    imax(vp->v_mount->mnt_stat.f_iosize, uio->uio_resid)) {
158		/* Probably a reordered RPC, leave seqcount alone. */
159	} else if (nh->nh_seqcount > 1) {
160		nh->nh_seqcount /= 2;
161	} else {
162		nh->nh_seqcount = 0;
163	}
164	nh->nh_use += NHUSE_INC;
165	if (nh->nh_use > NHUSE_MAX)
166		nh->nh_use = NHUSE_MAX;
167	return (nh);
168}
169
170/*
171 * Get attributes into nfsvattr structure.
172 */
173int
174nfsvno_getattr(struct vnode *vp, struct nfsvattr *nvap, struct ucred *cred,
175    struct thread *p, int vpislocked)
176{
177	int error, lockedit = 0;
178
179	if (vpislocked == 0) {
180		/*
181		 * When vpislocked == 0, the vnode is either exclusively
182		 * locked by this thread or not locked by this thread.
183		 * As such, shared lock it, if not exclusively locked.
184		 */
185		if (NFSVOPISLOCKED(vp) != LK_EXCLUSIVE) {
186			lockedit = 1;
187			NFSVOPLOCK(vp, LK_SHARED | LK_RETRY);
188		}
189	}
190	error = VOP_GETATTR(vp, &nvap->na_vattr, cred);
191	if (lockedit != 0)
192		NFSVOPUNLOCK(vp, 0);
193
194	NFSEXITCODE(error);
195	return (error);
196}
197
198/*
199 * Get a file handle for a vnode.
200 */
201int
202nfsvno_getfh(struct vnode *vp, fhandle_t *fhp, struct thread *p)
203{
204	int error;
205
206	NFSBZERO((caddr_t)fhp, sizeof(fhandle_t));
207	fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid;
208	error = VOP_VPTOFH(vp, &fhp->fh_fid);
209
210	NFSEXITCODE(error);
211	return (error);
212}
213
214/*
215 * Perform access checking for vnodes obtained from file handles that would
216 * refer to files already opened by a Unix client. You cannot just use
217 * vn_writechk() and VOP_ACCESSX() for two reasons.
218 * 1 - You must check for exported rdonly as well as MNT_RDONLY for the write
219 *     case.
220 * 2 - The owner is to be given access irrespective of mode bits for some
221 *     operations, so that processes that chmod after opening a file don't
222 *     break.
223 */
224int
225nfsvno_accchk(struct vnode *vp, accmode_t accmode, struct ucred *cred,
226    struct nfsexstuff *exp, struct thread *p, int override, int vpislocked,
227    u_int32_t *supportedtypep)
228{
229	struct vattr vattr;
230	int error = 0, getret = 0;
231
232	if (vpislocked == 0) {
233		if (NFSVOPLOCK(vp, LK_SHARED) != 0) {
234			error = EPERM;
235			goto out;
236		}
237	}
238	if (accmode & VWRITE) {
239		/* Just vn_writechk() changed to check rdonly */
240		/*
241		 * Disallow write attempts on read-only file systems;
242		 * unless the file is a socket or a block or character
243		 * device resident on the file system.
244		 */
245		if (NFSVNO_EXRDONLY(exp) ||
246		    (vp->v_mount->mnt_flag & MNT_RDONLY)) {
247			switch (vp->v_type) {
248			case VREG:
249			case VDIR:
250			case VLNK:
251				error = EROFS;
252			default:
253				break;
254			}
255		}
256		/*
257		 * If there's shared text associated with
258		 * the inode, try to free it up once.  If
259		 * we fail, we can't allow writing.
260		 */
261		if (VOP_IS_TEXT(vp) && error == 0)
262			error = ETXTBSY;
263	}
264	if (error != 0) {
265		if (vpislocked == 0)
266			NFSVOPUNLOCK(vp, 0);
267		goto out;
268	}
269
270	/*
271	 * Should the override still be applied when ACLs are enabled?
272	 */
273	error = VOP_ACCESSX(vp, accmode, cred, p);
274	if (error != 0 && (accmode & (VDELETE | VDELETE_CHILD))) {
275		/*
276		 * Try again with VEXPLICIT_DENY, to see if the test for
277		 * deletion is supported.
278		 */
279		error = VOP_ACCESSX(vp, accmode | VEXPLICIT_DENY, cred, p);
280		if (error == 0) {
281			if (vp->v_type == VDIR) {
282				accmode &= ~(VDELETE | VDELETE_CHILD);
283				accmode |= VWRITE;
284				error = VOP_ACCESSX(vp, accmode, cred, p);
285			} else if (supportedtypep != NULL) {
286				*supportedtypep &= ~NFSACCESS_DELETE;
287			}
288		}
289	}
290
291	/*
292	 * Allow certain operations for the owner (reads and writes
293	 * on files that are already open).
294	 */
295	if (override != NFSACCCHK_NOOVERRIDE &&
296	    (error == EPERM || error == EACCES)) {
297		if (cred->cr_uid == 0 && (override & NFSACCCHK_ALLOWROOT))
298			error = 0;
299		else if (override & NFSACCCHK_ALLOWOWNER) {
300			getret = VOP_GETATTR(vp, &vattr, cred);
301			if (getret == 0 && cred->cr_uid == vattr.va_uid)
302				error = 0;
303		}
304	}
305	if (vpislocked == 0)
306		NFSVOPUNLOCK(vp, 0);
307
308out:
309	NFSEXITCODE(error);
310	return (error);
311}
312
313/*
314 * Set attribute(s) vnop.
315 */
316int
317nfsvno_setattr(struct vnode *vp, struct nfsvattr *nvap, struct ucred *cred,
318    struct thread *p, struct nfsexstuff *exp)
319{
320	int error;
321
322	error = VOP_SETATTR(vp, &nvap->na_vattr, cred);
323	NFSEXITCODE(error);
324	return (error);
325}
326
327/*
328 * Set up nameidata for a lookup() call and do it.
329 */
330int
331nfsvno_namei(struct nfsrv_descript *nd, struct nameidata *ndp,
332    struct vnode *dp, int islocked, struct nfsexstuff *exp, struct thread *p,
333    struct vnode **retdirp)
334{
335	struct componentname *cnp = &ndp->ni_cnd;
336	int i;
337	struct iovec aiov;
338	struct uio auio;
339	int lockleaf = (cnp->cn_flags & LOCKLEAF) != 0, linklen;
340	int error = 0, crossmnt;
341	char *cp;
342
343	*retdirp = NULL;
344	cnp->cn_nameptr = cnp->cn_pnbuf;
345	ndp->ni_strictrelative = 0;
346	/*
347	 * Extract and set starting directory.
348	 */
349	if (dp->v_type != VDIR) {
350		if (islocked)
351			vput(dp);
352		else
353			vrele(dp);
354		nfsvno_relpathbuf(ndp);
355		error = ENOTDIR;
356		goto out1;
357	}
358	if (islocked)
359		NFSVOPUNLOCK(dp, 0);
360	VREF(dp);
361	*retdirp = dp;
362	if (NFSVNO_EXRDONLY(exp))
363		cnp->cn_flags |= RDONLY;
364	ndp->ni_segflg = UIO_SYSSPACE;
365	crossmnt = 1;
366
367	if (nd->nd_flag & ND_PUBLOOKUP) {
368		ndp->ni_loopcnt = 0;
369		if (cnp->cn_pnbuf[0] == '/') {
370			vrele(dp);
371			/*
372			 * Check for degenerate pathnames here, since lookup()
373			 * panics on them.
374			 */
375			for (i = 1; i < ndp->ni_pathlen; i++)
376				if (cnp->cn_pnbuf[i] != '/')
377					break;
378			if (i == ndp->ni_pathlen) {
379				error = NFSERR_ACCES;
380				goto out;
381			}
382			dp = rootvnode;
383			VREF(dp);
384		}
385	} else if ((nfsrv_enable_crossmntpt == 0 && NFSVNO_EXPORTED(exp)) ||
386	    (nd->nd_flag & ND_NFSV4) == 0) {
387		/*
388		 * Only cross mount points for NFSv4 when doing a
389		 * mount while traversing the file system above
390		 * the mount point, unless nfsrv_enable_crossmntpt is set.
391		 */
392		cnp->cn_flags |= NOCROSSMOUNT;
393		crossmnt = 0;
394	}
395
396	/*
397	 * Initialize for scan, set ni_startdir and bump ref on dp again
398	 * becuase lookup() will dereference ni_startdir.
399	 */
400
401	cnp->cn_thread = p;
402	ndp->ni_startdir = dp;
403	ndp->ni_rootdir = rootvnode;
404	ndp->ni_topdir = NULL;
405
406	if (!lockleaf)
407		cnp->cn_flags |= LOCKLEAF;
408	for (;;) {
409		cnp->cn_nameptr = cnp->cn_pnbuf;
410		/*
411		 * Call lookup() to do the real work.  If an error occurs,
412		 * ndp->ni_vp and ni_dvp are left uninitialized or NULL and
413		 * we do not have to dereference anything before returning.
414		 * In either case ni_startdir will be dereferenced and NULLed
415		 * out.
416		 */
417		error = lookup(ndp);
418		if (error)
419			break;
420
421		/*
422		 * Check for encountering a symbolic link.  Trivial
423		 * termination occurs if no symlink encountered.
424		 */
425		if ((cnp->cn_flags & ISSYMLINK) == 0) {
426			if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0)
427				nfsvno_relpathbuf(ndp);
428			if (ndp->ni_vp && !lockleaf)
429				NFSVOPUNLOCK(ndp->ni_vp, 0);
430			break;
431		}
432
433		/*
434		 * Validate symlink
435		 */
436		if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1)
437			NFSVOPUNLOCK(ndp->ni_dvp, 0);
438		if (!(nd->nd_flag & ND_PUBLOOKUP)) {
439			error = EINVAL;
440			goto badlink2;
441		}
442
443		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
444			error = ELOOP;
445			goto badlink2;
446		}
447		if (ndp->ni_pathlen > 1)
448			cp = uma_zalloc(namei_zone, M_WAITOK);
449		else
450			cp = cnp->cn_pnbuf;
451		aiov.iov_base = cp;
452		aiov.iov_len = MAXPATHLEN;
453		auio.uio_iov = &aiov;
454		auio.uio_iovcnt = 1;
455		auio.uio_offset = 0;
456		auio.uio_rw = UIO_READ;
457		auio.uio_segflg = UIO_SYSSPACE;
458		auio.uio_td = NULL;
459		auio.uio_resid = MAXPATHLEN;
460		error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
461		if (error) {
462		badlink1:
463			if (ndp->ni_pathlen > 1)
464				uma_zfree(namei_zone, cp);
465		badlink2:
466			vrele(ndp->ni_dvp);
467			vput(ndp->ni_vp);
468			break;
469		}
470		linklen = MAXPATHLEN - auio.uio_resid;
471		if (linklen == 0) {
472			error = ENOENT;
473			goto badlink1;
474		}
475		if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
476			error = ENAMETOOLONG;
477			goto badlink1;
478		}
479
480		/*
481		 * Adjust or replace path
482		 */
483		if (ndp->ni_pathlen > 1) {
484			NFSBCOPY(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
485			uma_zfree(namei_zone, cnp->cn_pnbuf);
486			cnp->cn_pnbuf = cp;
487		} else
488			cnp->cn_pnbuf[linklen] = '\0';
489		ndp->ni_pathlen += linklen;
490
491		/*
492		 * Cleanup refs for next loop and check if root directory
493		 * should replace current directory.  Normally ni_dvp
494		 * becomes the new base directory and is cleaned up when
495		 * we loop.  Explicitly null pointers after invalidation
496		 * to clarify operation.
497		 */
498		vput(ndp->ni_vp);
499		ndp->ni_vp = NULL;
500
501		if (cnp->cn_pnbuf[0] == '/') {
502			vrele(ndp->ni_dvp);
503			ndp->ni_dvp = ndp->ni_rootdir;
504			VREF(ndp->ni_dvp);
505		}
506		ndp->ni_startdir = ndp->ni_dvp;
507		ndp->ni_dvp = NULL;
508	}
509	if (!lockleaf)
510		cnp->cn_flags &= ~LOCKLEAF;
511
512out:
513	if (error) {
514		uma_zfree(namei_zone, cnp->cn_pnbuf);
515		ndp->ni_vp = NULL;
516		ndp->ni_dvp = NULL;
517		ndp->ni_startdir = NULL;
518		cnp->cn_flags &= ~HASBUF;
519	} else if ((ndp->ni_cnd.cn_flags & (WANTPARENT|LOCKPARENT)) == 0) {
520		ndp->ni_dvp = NULL;
521	}
522
523out1:
524	NFSEXITCODE2(error, nd);
525	return (error);
526}
527
528/*
529 * Set up a pathname buffer and return a pointer to it and, optionally
530 * set a hash pointer.
531 */
532void
533nfsvno_setpathbuf(struct nameidata *ndp, char **bufpp, u_long **hashpp)
534{
535	struct componentname *cnp = &ndp->ni_cnd;
536
537	cnp->cn_flags |= (NOMACCHECK | HASBUF);
538	cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
539	if (hashpp != NULL)
540		*hashpp = NULL;
541	*bufpp = cnp->cn_pnbuf;
542}
543
544/*
545 * Release the above path buffer, if not released by nfsvno_namei().
546 */
547void
548nfsvno_relpathbuf(struct nameidata *ndp)
549{
550
551	if ((ndp->ni_cnd.cn_flags & HASBUF) == 0)
552		panic("nfsrelpath");
553	uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
554	ndp->ni_cnd.cn_flags &= ~HASBUF;
555}
556
557/*
558 * Readlink vnode op into an mbuf list.
559 */
560int
561nfsvno_readlink(struct vnode *vp, struct ucred *cred, struct thread *p,
562    struct mbuf **mpp, struct mbuf **mpendp, int *lenp)
563{
564	struct iovec iv[(NFS_MAXPATHLEN+MLEN-1)/MLEN];
565	struct iovec *ivp = iv;
566	struct uio io, *uiop = &io;
567	struct mbuf *mp, *mp2 = NULL, *mp3 = NULL;
568	int i, len, tlen, error = 0;
569
570	len = 0;
571	i = 0;
572	while (len < NFS_MAXPATHLEN) {
573		NFSMGET(mp);
574		MCLGET(mp, M_WAIT);
575		mp->m_len = NFSMSIZ(mp);
576		if (len == 0) {
577			mp3 = mp2 = mp;
578		} else {
579			mp2->m_next = mp;
580			mp2 = mp;
581		}
582		if ((len + mp->m_len) > NFS_MAXPATHLEN) {
583			mp->m_len = NFS_MAXPATHLEN - len;
584			len = NFS_MAXPATHLEN;
585		} else {
586			len += mp->m_len;
587		}
588		ivp->iov_base = mtod(mp, caddr_t);
589		ivp->iov_len = mp->m_len;
590		i++;
591		ivp++;
592	}
593	uiop->uio_iov = iv;
594	uiop->uio_iovcnt = i;
595	uiop->uio_offset = 0;
596	uiop->uio_resid = len;
597	uiop->uio_rw = UIO_READ;
598	uiop->uio_segflg = UIO_SYSSPACE;
599	uiop->uio_td = NULL;
600	error = VOP_READLINK(vp, uiop, cred);
601	if (error) {
602		m_freem(mp3);
603		*lenp = 0;
604		goto out;
605	}
606	if (uiop->uio_resid > 0) {
607		len -= uiop->uio_resid;
608		tlen = NFSM_RNDUP(len);
609		nfsrv_adj(mp3, NFS_MAXPATHLEN - tlen, tlen - len);
610	}
611	*lenp = len;
612	*mpp = mp3;
613	*mpendp = mp;
614
615out:
616	NFSEXITCODE(error);
617	return (error);
618}
619
620/*
621 * Read vnode op call into mbuf list.
622 */
623int
624nfsvno_read(struct vnode *vp, off_t off, int cnt, struct ucred *cred,
625    struct thread *p, struct mbuf **mpp, struct mbuf **mpendp)
626{
627	struct mbuf *m;
628	int i;
629	struct iovec *iv;
630	struct iovec *iv2;
631	int error = 0, len, left, siz, tlen, ioflag = 0;
632	struct mbuf *m2 = NULL, *m3;
633	struct uio io, *uiop = &io;
634	struct nfsheur *nh;
635
636	len = left = NFSM_RNDUP(cnt);
637	m3 = NULL;
638	/*
639	 * Generate the mbuf list with the uio_iov ref. to it.
640	 */
641	i = 0;
642	while (left > 0) {
643		NFSMGET(m);
644		MCLGET(m, M_WAIT);
645		m->m_len = 0;
646		siz = min(M_TRAILINGSPACE(m), left);
647		left -= siz;
648		i++;
649		if (m3)
650			m2->m_next = m;
651		else
652			m3 = m;
653		m2 = m;
654	}
655	MALLOC(iv, struct iovec *, i * sizeof (struct iovec),
656	    M_TEMP, M_WAITOK);
657	uiop->uio_iov = iv2 = iv;
658	m = m3;
659	left = len;
660	i = 0;
661	while (left > 0) {
662		if (m == NULL)
663			panic("nfsvno_read iov");
664		siz = min(M_TRAILINGSPACE(m), left);
665		if (siz > 0) {
666			iv->iov_base = mtod(m, caddr_t) + m->m_len;
667			iv->iov_len = siz;
668			m->m_len += siz;
669			left -= siz;
670			iv++;
671			i++;
672		}
673		m = m->m_next;
674	}
675	uiop->uio_iovcnt = i;
676	uiop->uio_offset = off;
677	uiop->uio_resid = len;
678	uiop->uio_rw = UIO_READ;
679	uiop->uio_segflg = UIO_SYSSPACE;
680	uiop->uio_td = NULL;
681	nh = nfsrv_sequential_heuristic(uiop, vp);
682	ioflag |= nh->nh_seqcount << IO_SEQSHIFT;
683	error = VOP_READ(vp, uiop, IO_NODELOCKED | ioflag, cred);
684	FREE((caddr_t)iv2, M_TEMP);
685	if (error) {
686		m_freem(m3);
687		*mpp = NULL;
688		goto out;
689	}
690	nh->nh_nextoff = uiop->uio_offset;
691	tlen = len - uiop->uio_resid;
692	cnt = cnt < tlen ? cnt : tlen;
693	tlen = NFSM_RNDUP(cnt);
694	if (tlen == 0) {
695		m_freem(m3);
696		m3 = NULL;
697	} else if (len != tlen || tlen != cnt)
698		nfsrv_adj(m3, len - tlen, tlen - cnt);
699	*mpp = m3;
700	*mpendp = m2;
701
702out:
703	NFSEXITCODE(error);
704	return (error);
705}
706
707/*
708 * Write vnode op from an mbuf list.
709 */
710int
711nfsvno_write(struct vnode *vp, off_t off, int retlen, int cnt, int stable,
712    struct mbuf *mp, char *cp, struct ucred *cred, struct thread *p)
713{
714	struct iovec *ivp;
715	int i, len;
716	struct iovec *iv;
717	int ioflags, error;
718	struct uio io, *uiop = &io;
719	struct nfsheur *nh;
720
721	MALLOC(ivp, struct iovec *, cnt * sizeof (struct iovec), M_TEMP,
722	    M_WAITOK);
723	uiop->uio_iov = iv = ivp;
724	uiop->uio_iovcnt = cnt;
725	i = mtod(mp, caddr_t) + mp->m_len - cp;
726	len = retlen;
727	while (len > 0) {
728		if (mp == NULL)
729			panic("nfsvno_write");
730		if (i > 0) {
731			i = min(i, len);
732			ivp->iov_base = cp;
733			ivp->iov_len = i;
734			ivp++;
735			len -= i;
736		}
737		mp = mp->m_next;
738		if (mp) {
739			i = mp->m_len;
740			cp = mtod(mp, caddr_t);
741		}
742	}
743
744	if (stable == NFSWRITE_UNSTABLE)
745		ioflags = IO_NODELOCKED;
746	else
747		ioflags = (IO_SYNC | IO_NODELOCKED);
748	uiop->uio_resid = retlen;
749	uiop->uio_rw = UIO_WRITE;
750	uiop->uio_segflg = UIO_SYSSPACE;
751	NFSUIOPROC(uiop, p);
752	uiop->uio_offset = off;
753	nh = nfsrv_sequential_heuristic(uiop, vp);
754	ioflags |= nh->nh_seqcount << IO_SEQSHIFT;
755	error = VOP_WRITE(vp, uiop, ioflags, cred);
756	if (error == 0)
757		nh->nh_nextoff = uiop->uio_offset;
758	FREE((caddr_t)iv, M_TEMP);
759
760	NFSEXITCODE(error);
761	return (error);
762}
763
764/*
765 * Common code for creating a regular file (plus special files for V2).
766 */
767int
768nfsvno_createsub(struct nfsrv_descript *nd, struct nameidata *ndp,
769    struct vnode **vpp, struct nfsvattr *nvap, int *exclusive_flagp,
770    int32_t *cverf, NFSDEV_T rdev, struct thread *p, struct nfsexstuff *exp)
771{
772	u_quad_t tempsize;
773	int error;
774
775	error = nd->nd_repstat;
776	if (!error && ndp->ni_vp == NULL) {
777		if (nvap->na_type == VREG || nvap->na_type == VSOCK) {
778			vrele(ndp->ni_startdir);
779			error = VOP_CREATE(ndp->ni_dvp,
780			    &ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr);
781			vput(ndp->ni_dvp);
782			nfsvno_relpathbuf(ndp);
783			if (!error) {
784				if (*exclusive_flagp) {
785					*exclusive_flagp = 0;
786					NFSVNO_ATTRINIT(nvap);
787					nvap->na_atime.tv_sec = cverf[0];
788					nvap->na_atime.tv_nsec = cverf[1];
789					error = VOP_SETATTR(ndp->ni_vp,
790					    &nvap->na_vattr, nd->nd_cred);
791				}
792			}
793		/*
794		 * NFS V2 Only. nfsrvd_mknod() does this for V3.
795		 * (This implies, just get out on an error.)
796		 */
797		} else if (nvap->na_type == VCHR || nvap->na_type == VBLK ||
798			nvap->na_type == VFIFO) {
799			if (nvap->na_type == VCHR && rdev == 0xffffffff)
800				nvap->na_type = VFIFO;
801                        if (nvap->na_type != VFIFO &&
802			    (error = priv_check_cred(nd->nd_cred,
803			     PRIV_VFS_MKNOD_DEV, 0))) {
804				vrele(ndp->ni_startdir);
805				nfsvno_relpathbuf(ndp);
806				vput(ndp->ni_dvp);
807				goto out;
808			}
809			nvap->na_rdev = rdev;
810			error = VOP_MKNOD(ndp->ni_dvp, &ndp->ni_vp,
811			    &ndp->ni_cnd, &nvap->na_vattr);
812			vput(ndp->ni_dvp);
813			nfsvno_relpathbuf(ndp);
814			vrele(ndp->ni_startdir);
815			if (error)
816				goto out;
817		} else {
818			vrele(ndp->ni_startdir);
819			nfsvno_relpathbuf(ndp);
820			vput(ndp->ni_dvp);
821			error = ENXIO;
822			goto out;
823		}
824		*vpp = ndp->ni_vp;
825	} else {
826		/*
827		 * Handle cases where error is already set and/or
828		 * the file exists.
829		 * 1 - clean up the lookup
830		 * 2 - iff !error and na_size set, truncate it
831		 */
832		vrele(ndp->ni_startdir);
833		nfsvno_relpathbuf(ndp);
834		*vpp = ndp->ni_vp;
835		if (ndp->ni_dvp == *vpp)
836			vrele(ndp->ni_dvp);
837		else
838			vput(ndp->ni_dvp);
839		if (!error && nvap->na_size != VNOVAL) {
840			error = nfsvno_accchk(*vpp, VWRITE,
841			    nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
842			    NFSACCCHK_VPISLOCKED, NULL);
843			if (!error) {
844				tempsize = nvap->na_size;
845				NFSVNO_ATTRINIT(nvap);
846				nvap->na_size = tempsize;
847				error = VOP_SETATTR(*vpp,
848				    &nvap->na_vattr, nd->nd_cred);
849			}
850		}
851		if (error)
852			vput(*vpp);
853	}
854
855out:
856	NFSEXITCODE(error);
857	return (error);
858}
859
860/*
861 * Do a mknod vnode op.
862 */
863int
864nfsvno_mknod(struct nameidata *ndp, struct nfsvattr *nvap, struct ucred *cred,
865    struct thread *p)
866{
867	int error = 0;
868	enum vtype vtyp;
869
870	vtyp = nvap->na_type;
871	/*
872	 * Iff doesn't exist, create it.
873	 */
874	if (ndp->ni_vp) {
875		vrele(ndp->ni_startdir);
876		nfsvno_relpathbuf(ndp);
877		vput(ndp->ni_dvp);
878		vrele(ndp->ni_vp);
879		error = EEXIST;
880		goto out;
881	}
882	if (vtyp != VCHR && vtyp != VBLK && vtyp != VSOCK && vtyp != VFIFO) {
883		vrele(ndp->ni_startdir);
884		nfsvno_relpathbuf(ndp);
885		vput(ndp->ni_dvp);
886		error = NFSERR_BADTYPE;
887		goto out;
888	}
889	if (vtyp == VSOCK) {
890		vrele(ndp->ni_startdir);
891		error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
892		    &ndp->ni_cnd, &nvap->na_vattr);
893		vput(ndp->ni_dvp);
894		nfsvno_relpathbuf(ndp);
895	} else {
896		if (nvap->na_type != VFIFO &&
897		    (error = priv_check_cred(cred, PRIV_VFS_MKNOD_DEV, 0))) {
898			vrele(ndp->ni_startdir);
899			nfsvno_relpathbuf(ndp);
900			vput(ndp->ni_dvp);
901			goto out;
902		}
903		error = VOP_MKNOD(ndp->ni_dvp, &ndp->ni_vp,
904		    &ndp->ni_cnd, &nvap->na_vattr);
905		vput(ndp->ni_dvp);
906		nfsvno_relpathbuf(ndp);
907		vrele(ndp->ni_startdir);
908		/*
909		 * Since VOP_MKNOD returns the ni_vp, I can't
910		 * see any reason to do the lookup.
911		 */
912	}
913
914out:
915	NFSEXITCODE(error);
916	return (error);
917}
918
919/*
920 * Mkdir vnode op.
921 */
922int
923nfsvno_mkdir(struct nameidata *ndp, struct nfsvattr *nvap, uid_t saved_uid,
924    struct ucred *cred, struct thread *p, struct nfsexstuff *exp)
925{
926	int error = 0;
927
928	if (ndp->ni_vp != NULL) {
929		if (ndp->ni_dvp == ndp->ni_vp)
930			vrele(ndp->ni_dvp);
931		else
932			vput(ndp->ni_dvp);
933		vrele(ndp->ni_vp);
934		nfsvno_relpathbuf(ndp);
935		error = EEXIST;
936		goto out;
937	}
938	error = VOP_MKDIR(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd,
939	    &nvap->na_vattr);
940	vput(ndp->ni_dvp);
941	nfsvno_relpathbuf(ndp);
942
943out:
944	NFSEXITCODE(error);
945	return (error);
946}
947
948/*
949 * symlink vnode op.
950 */
951int
952nfsvno_symlink(struct nameidata *ndp, struct nfsvattr *nvap, char *pathcp,
953    int pathlen, int not_v2, uid_t saved_uid, struct ucred *cred, struct thread *p,
954    struct nfsexstuff *exp)
955{
956	int error = 0;
957
958	if (ndp->ni_vp) {
959		vrele(ndp->ni_startdir);
960		nfsvno_relpathbuf(ndp);
961		if (ndp->ni_dvp == ndp->ni_vp)
962			vrele(ndp->ni_dvp);
963		else
964			vput(ndp->ni_dvp);
965		vrele(ndp->ni_vp);
966		error = EEXIST;
967		goto out;
968	}
969
970	error = VOP_SYMLINK(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd,
971	    &nvap->na_vattr, pathcp);
972	vput(ndp->ni_dvp);
973	vrele(ndp->ni_startdir);
974	nfsvno_relpathbuf(ndp);
975	/*
976	 * Although FreeBSD still had the lookup code in
977	 * it for 7/current, there doesn't seem to be any
978	 * point, since VOP_SYMLINK() returns the ni_vp.
979	 * Just vput it for v2.
980	 */
981	if (!not_v2 && !error)
982		vput(ndp->ni_vp);
983
984out:
985	NFSEXITCODE(error);
986	return (error);
987}
988
989/*
990 * Parse symbolic link arguments.
991 * This function has an ugly side effect. It will MALLOC() an area for
992 * the symlink and set iov_base to point to it, only if it succeeds.
993 * So, if it returns with uiop->uio_iov->iov_base != NULL, that must
994 * be FREE'd later.
995 */
996int
997nfsvno_getsymlink(struct nfsrv_descript *nd, struct nfsvattr *nvap,
998    struct thread *p, char **pathcpp, int *lenp)
999{
1000	u_int32_t *tl;
1001	char *pathcp = NULL;
1002	int error = 0, len;
1003	struct nfsv2_sattr *sp;
1004
1005	*pathcpp = NULL;
1006	*lenp = 0;
1007	if ((nd->nd_flag & ND_NFSV3) &&
1008	    (error = nfsrv_sattr(nd, nvap, NULL, NULL, p)))
1009		goto nfsmout;
1010	NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
1011	len = fxdr_unsigned(int, *tl);
1012	if (len > NFS_MAXPATHLEN || len <= 0) {
1013		error = EBADRPC;
1014		goto nfsmout;
1015	}
1016	MALLOC(pathcp, caddr_t, len + 1, M_TEMP, M_WAITOK);
1017	error = nfsrv_mtostr(nd, pathcp, len);
1018	if (error)
1019		goto nfsmout;
1020	if (nd->nd_flag & ND_NFSV2) {
1021		NFSM_DISSECT(sp, struct nfsv2_sattr *, NFSX_V2SATTR);
1022		nvap->na_mode = fxdr_unsigned(u_int16_t, sp->sa_mode);
1023	}
1024	*pathcpp = pathcp;
1025	*lenp = len;
1026	NFSEXITCODE2(0, nd);
1027	return (0);
1028nfsmout:
1029	if (pathcp)
1030		free(pathcp, M_TEMP);
1031	NFSEXITCODE2(error, nd);
1032	return (error);
1033}
1034
1035/*
1036 * Remove a non-directory object.
1037 */
1038int
1039nfsvno_removesub(struct nameidata *ndp, int is_v4, struct ucred *cred,
1040    struct thread *p, struct nfsexstuff *exp)
1041{
1042	struct vnode *vp;
1043	int error = 0;
1044
1045	vp = ndp->ni_vp;
1046	if (vp->v_type == VDIR)
1047		error = NFSERR_ISDIR;
1048	else if (is_v4)
1049		error = nfsrv_checkremove(vp, 1, p);
1050	if (!error)
1051		error = VOP_REMOVE(ndp->ni_dvp, vp, &ndp->ni_cnd);
1052	if (ndp->ni_dvp == vp)
1053		vrele(ndp->ni_dvp);
1054	else
1055		vput(ndp->ni_dvp);
1056	vput(vp);
1057	if ((ndp->ni_cnd.cn_flags & SAVENAME) != 0)
1058		nfsvno_relpathbuf(ndp);
1059	NFSEXITCODE(error);
1060	return (error);
1061}
1062
1063/*
1064 * Remove a directory.
1065 */
1066int
1067nfsvno_rmdirsub(struct nameidata *ndp, int is_v4, struct ucred *cred,
1068    struct thread *p, struct nfsexstuff *exp)
1069{
1070	struct vnode *vp;
1071	int error = 0;
1072
1073	vp = ndp->ni_vp;
1074	if (vp->v_type != VDIR) {
1075		error = ENOTDIR;
1076		goto out;
1077	}
1078	/*
1079	 * No rmdir "." please.
1080	 */
1081	if (ndp->ni_dvp == vp) {
1082		error = EINVAL;
1083		goto out;
1084	}
1085	/*
1086	 * The root of a mounted filesystem cannot be deleted.
1087	 */
1088	if (vp->v_vflag & VV_ROOT)
1089		error = EBUSY;
1090out:
1091	if (!error)
1092		error = VOP_RMDIR(ndp->ni_dvp, vp, &ndp->ni_cnd);
1093	if (ndp->ni_dvp == vp)
1094		vrele(ndp->ni_dvp);
1095	else
1096		vput(ndp->ni_dvp);
1097	vput(vp);
1098	if ((ndp->ni_cnd.cn_flags & SAVENAME) != 0)
1099		nfsvno_relpathbuf(ndp);
1100	NFSEXITCODE(error);
1101	return (error);
1102}
1103
1104/*
1105 * Rename vnode op.
1106 */
1107int
1108nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp,
1109    u_int32_t ndstat, u_int32_t ndflag, struct ucred *cred, struct thread *p)
1110{
1111	struct vnode *fvp, *tvp, *tdvp;
1112	int error = 0;
1113
1114	fvp = fromndp->ni_vp;
1115	if (ndstat) {
1116		vrele(fromndp->ni_dvp);
1117		vrele(fvp);
1118		error = ndstat;
1119		goto out1;
1120	}
1121	tdvp = tondp->ni_dvp;
1122	tvp = tondp->ni_vp;
1123	if (tvp != NULL) {
1124		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
1125			error = (ndflag & ND_NFSV2) ? EISDIR : EEXIST;
1126			goto out;
1127		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
1128			error = (ndflag & ND_NFSV2) ? ENOTDIR : EEXIST;
1129			goto out;
1130		}
1131		if (tvp->v_type == VDIR && tvp->v_mountedhere) {
1132			error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
1133			goto out;
1134		}
1135
1136		/*
1137		 * A rename to '.' or '..' results in a prematurely
1138		 * unlocked vnode on FreeBSD5, so I'm just going to fail that
1139		 * here.
1140		 */
1141		if ((tondp->ni_cnd.cn_namelen == 1 &&
1142		     tondp->ni_cnd.cn_nameptr[0] == '.') ||
1143		    (tondp->ni_cnd.cn_namelen == 2 &&
1144		     tondp->ni_cnd.cn_nameptr[0] == '.' &&
1145		     tondp->ni_cnd.cn_nameptr[1] == '.')) {
1146			error = EINVAL;
1147			goto out;
1148		}
1149	}
1150	if (fvp->v_type == VDIR && fvp->v_mountedhere) {
1151		error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
1152		goto out;
1153	}
1154	if (fvp->v_mount != tdvp->v_mount) {
1155		error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
1156		goto out;
1157	}
1158	if (fvp == tdvp) {
1159		error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EINVAL;
1160		goto out;
1161	}
1162	if (fvp == tvp) {
1163		/*
1164		 * If source and destination are the same, there is nothing to
1165		 * do. Set error to -1 to indicate this.
1166		 */
1167		error = -1;
1168		goto out;
1169	}
1170	if (ndflag & ND_NFSV4) {
1171		if (NFSVOPLOCK(fvp, LK_EXCLUSIVE) == 0) {
1172			error = nfsrv_checkremove(fvp, 0, p);
1173			NFSVOPUNLOCK(fvp, 0);
1174		} else
1175			error = EPERM;
1176		if (tvp && !error)
1177			error = nfsrv_checkremove(tvp, 1, p);
1178	} else {
1179		/*
1180		 * For NFSv2 and NFSv3, try to get rid of the delegation, so
1181		 * that the NFSv4 client won't be confused by the rename.
1182		 * Since nfsd_recalldelegation() can only be called on an
1183		 * unlocked vnode at this point and fvp is the file that will
1184		 * still exist after the rename, just do fvp.
1185		 */
1186		nfsd_recalldelegation(fvp, p);
1187	}
1188out:
1189	if (!error) {
1190		error = VOP_RENAME(fromndp->ni_dvp, fromndp->ni_vp,
1191		    &fromndp->ni_cnd, tondp->ni_dvp, tondp->ni_vp,
1192		    &tondp->ni_cnd);
1193	} else {
1194		if (tdvp == tvp)
1195			vrele(tdvp);
1196		else
1197			vput(tdvp);
1198		if (tvp)
1199			vput(tvp);
1200		vrele(fromndp->ni_dvp);
1201		vrele(fvp);
1202		if (error == -1)
1203			error = 0;
1204	}
1205	vrele(tondp->ni_startdir);
1206	nfsvno_relpathbuf(tondp);
1207out1:
1208	vrele(fromndp->ni_startdir);
1209	nfsvno_relpathbuf(fromndp);
1210	NFSEXITCODE(error);
1211	return (error);
1212}
1213
1214/*
1215 * Link vnode op.
1216 */
1217int
1218nfsvno_link(struct nameidata *ndp, struct vnode *vp, struct ucred *cred,
1219    struct thread *p, struct nfsexstuff *exp)
1220{
1221	struct vnode *xp;
1222	int error = 0;
1223
1224	xp = ndp->ni_vp;
1225	if (xp != NULL) {
1226		error = EEXIST;
1227	} else {
1228		xp = ndp->ni_dvp;
1229		if (vp->v_mount != xp->v_mount)
1230			error = EXDEV;
1231	}
1232	if (!error) {
1233		NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
1234		if ((vp->v_iflag & VI_DOOMED) == 0)
1235			error = VOP_LINK(ndp->ni_dvp, vp, &ndp->ni_cnd);
1236		else
1237			error = EPERM;
1238		if (ndp->ni_dvp == vp)
1239			vrele(ndp->ni_dvp);
1240		else
1241			vput(ndp->ni_dvp);
1242		NFSVOPUNLOCK(vp, 0);
1243	} else {
1244		if (ndp->ni_dvp == ndp->ni_vp)
1245			vrele(ndp->ni_dvp);
1246		else
1247			vput(ndp->ni_dvp);
1248		if (ndp->ni_vp)
1249			vrele(ndp->ni_vp);
1250	}
1251	nfsvno_relpathbuf(ndp);
1252	NFSEXITCODE(error);
1253	return (error);
1254}
1255
1256/*
1257 * Do the fsync() appropriate for the commit.
1258 */
1259int
1260nfsvno_fsync(struct vnode *vp, u_int64_t off, int cnt, struct ucred *cred,
1261    struct thread *td)
1262{
1263	int error = 0;
1264
1265	/*
1266	 * RFC 1813 3.3.21: if count is 0, a flush from offset to the end of
1267	 * file is done.  At this time VOP_FSYNC does not accept offset and
1268	 * byte count parameters so call VOP_FSYNC the whole file for now.
1269	 * The same is true for NFSv4: RFC 3530 Sec. 14.2.3.
1270	 */
1271	if (cnt == 0 || cnt > MAX_COMMIT_COUNT) {
1272		/*
1273		 * Give up and do the whole thing
1274		 */
1275		if (vp->v_object &&
1276		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
1277			VM_OBJECT_LOCK(vp->v_object);
1278			vm_object_page_clean(vp->v_object, 0, 0, OBJPC_SYNC);
1279			VM_OBJECT_UNLOCK(vp->v_object);
1280		}
1281		error = VOP_FSYNC(vp, MNT_WAIT, td);
1282	} else {
1283		/*
1284		 * Locate and synchronously write any buffers that fall
1285		 * into the requested range.  Note:  we are assuming that
1286		 * f_iosize is a power of 2.
1287		 */
1288		int iosize = vp->v_mount->mnt_stat.f_iosize;
1289		int iomask = iosize - 1;
1290		struct bufobj *bo;
1291		daddr_t lblkno;
1292
1293		/*
1294		 * Align to iosize boundry, super-align to page boundry.
1295		 */
1296		if (off & iomask) {
1297			cnt += off & iomask;
1298			off &= ~(u_quad_t)iomask;
1299		}
1300		if (off & PAGE_MASK) {
1301			cnt += off & PAGE_MASK;
1302			off &= ~(u_quad_t)PAGE_MASK;
1303		}
1304		lblkno = off / iosize;
1305
1306		if (vp->v_object &&
1307		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
1308			VM_OBJECT_LOCK(vp->v_object);
1309			vm_object_page_clean(vp->v_object, off, off + cnt,
1310			    OBJPC_SYNC);
1311			VM_OBJECT_UNLOCK(vp->v_object);
1312		}
1313
1314		bo = &vp->v_bufobj;
1315		BO_LOCK(bo);
1316		while (cnt > 0) {
1317			struct buf *bp;
1318
1319			/*
1320			 * If we have a buffer and it is marked B_DELWRI we
1321			 * have to lock and write it.  Otherwise the prior
1322			 * write is assumed to have already been committed.
1323			 *
1324			 * gbincore() can return invalid buffers now so we
1325			 * have to check that bit as well (though B_DELWRI
1326			 * should not be set if B_INVAL is set there could be
1327			 * a race here since we haven't locked the buffer).
1328			 */
1329			if ((bp = gbincore(&vp->v_bufobj, lblkno)) != NULL) {
1330				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
1331				    LK_INTERLOCK, BO_MTX(bo)) == ENOLCK) {
1332					BO_LOCK(bo);
1333					continue; /* retry */
1334				}
1335			    	if ((bp->b_flags & (B_DELWRI|B_INVAL)) ==
1336				    B_DELWRI) {
1337					bremfree(bp);
1338					bp->b_flags &= ~B_ASYNC;
1339					bwrite(bp);
1340					++nfs_commit_miss;
1341				} else
1342					BUF_UNLOCK(bp);
1343				BO_LOCK(bo);
1344			}
1345			++nfs_commit_blks;
1346			if (cnt < iosize)
1347				break;
1348			cnt -= iosize;
1349			++lblkno;
1350		}
1351		BO_UNLOCK(bo);
1352	}
1353	NFSEXITCODE(error);
1354	return (error);
1355}
1356
1357/*
1358 * Statfs vnode op.
1359 */
1360int
1361nfsvno_statfs(struct vnode *vp, struct statfs *sf)
1362{
1363	int error;
1364
1365	error = VFS_STATFS(vp->v_mount, sf);
1366	if (error == 0) {
1367		/*
1368		 * Since NFS handles these values as unsigned on the
1369		 * wire, there is no way to represent negative values,
1370		 * so set them to 0. Without this, they will appear
1371		 * to be very large positive values for clients like
1372		 * Solaris10.
1373		 */
1374		if (sf->f_bavail < 0)
1375			sf->f_bavail = 0;
1376		if (sf->f_ffree < 0)
1377			sf->f_ffree = 0;
1378	}
1379	NFSEXITCODE(error);
1380	return (error);
1381}
1382
1383/*
1384 * Do the vnode op stuff for Open. Similar to nfsvno_createsub(), but
1385 * must handle nfsrv_opencheck() calls after any other access checks.
1386 */
1387void
1388nfsvno_open(struct nfsrv_descript *nd, struct nameidata *ndp,
1389    nfsquad_t clientid, nfsv4stateid_t *stateidp, struct nfsstate *stp,
1390    int *exclusive_flagp, struct nfsvattr *nvap, int32_t *cverf, int create,
1391    NFSACL_T *aclp, nfsattrbit_t *attrbitp, struct ucred *cred, struct thread *p,
1392    struct nfsexstuff *exp, struct vnode **vpp)
1393{
1394	struct vnode *vp = NULL;
1395	u_quad_t tempsize;
1396	struct nfsexstuff nes;
1397
1398	if (ndp->ni_vp == NULL)
1399		nd->nd_repstat = nfsrv_opencheck(clientid,
1400		    stateidp, stp, NULL, nd, p, nd->nd_repstat);
1401	if (!nd->nd_repstat) {
1402		if (ndp->ni_vp == NULL) {
1403			vrele(ndp->ni_startdir);
1404			nd->nd_repstat = VOP_CREATE(ndp->ni_dvp,
1405			    &ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr);
1406			vput(ndp->ni_dvp);
1407			nfsvno_relpathbuf(ndp);
1408			if (!nd->nd_repstat) {
1409				if (*exclusive_flagp) {
1410					*exclusive_flagp = 0;
1411					NFSVNO_ATTRINIT(nvap);
1412					nvap->na_atime.tv_sec = cverf[0];
1413					nvap->na_atime.tv_nsec = cverf[1];
1414					nd->nd_repstat = VOP_SETATTR(ndp->ni_vp,
1415					    &nvap->na_vattr, cred);
1416				} else {
1417					nfsrv_fixattr(nd, ndp->ni_vp, nvap,
1418					    aclp, p, attrbitp, exp);
1419				}
1420			}
1421			vp = ndp->ni_vp;
1422		} else {
1423			if (ndp->ni_startdir)
1424				vrele(ndp->ni_startdir);
1425			nfsvno_relpathbuf(ndp);
1426			vp = ndp->ni_vp;
1427			if (create == NFSV4OPEN_CREATE) {
1428				if (ndp->ni_dvp == vp)
1429					vrele(ndp->ni_dvp);
1430				else
1431					vput(ndp->ni_dvp);
1432			}
1433			if (NFSVNO_ISSETSIZE(nvap) && vp->v_type == VREG) {
1434				if (ndp->ni_cnd.cn_flags & RDONLY)
1435					NFSVNO_SETEXRDONLY(&nes);
1436				else
1437					NFSVNO_EXINIT(&nes);
1438				nd->nd_repstat = nfsvno_accchk(vp,
1439				    VWRITE, cred, &nes, p,
1440				    NFSACCCHK_NOOVERRIDE,
1441				    NFSACCCHK_VPISLOCKED, NULL);
1442				nd->nd_repstat = nfsrv_opencheck(clientid,
1443				    stateidp, stp, vp, nd, p, nd->nd_repstat);
1444				if (!nd->nd_repstat) {
1445					tempsize = nvap->na_size;
1446					NFSVNO_ATTRINIT(nvap);
1447					nvap->na_size = tempsize;
1448					nd->nd_repstat = VOP_SETATTR(vp,
1449					    &nvap->na_vattr, cred);
1450				}
1451			} else if (vp->v_type == VREG) {
1452				nd->nd_repstat = nfsrv_opencheck(clientid,
1453				    stateidp, stp, vp, nd, p, nd->nd_repstat);
1454			}
1455		}
1456	} else {
1457		if (ndp->ni_cnd.cn_flags & HASBUF)
1458			nfsvno_relpathbuf(ndp);
1459		if (ndp->ni_startdir && create == NFSV4OPEN_CREATE) {
1460			vrele(ndp->ni_startdir);
1461			if (ndp->ni_dvp == ndp->ni_vp)
1462				vrele(ndp->ni_dvp);
1463			else
1464				vput(ndp->ni_dvp);
1465			if (ndp->ni_vp)
1466				vput(ndp->ni_vp);
1467		}
1468	}
1469	*vpp = vp;
1470
1471	NFSEXITCODE2(0, nd);
1472}
1473
1474/*
1475 * Updates the file rev and sets the mtime and ctime
1476 * to the current clock time, returning the va_filerev and va_Xtime
1477 * values.
1478 * Return ESTALE to indicate the vnode is VI_DOOMED.
1479 */
1480int
1481nfsvno_updfilerev(struct vnode *vp, struct nfsvattr *nvap,
1482    struct ucred *cred, struct thread *p)
1483{
1484	struct vattr va;
1485
1486	VATTR_NULL(&va);
1487	vfs_timestamp(&va.va_mtime);
1488	if (NFSVOPISLOCKED(vp) != LK_EXCLUSIVE) {
1489		NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY);
1490		if ((vp->v_iflag & VI_DOOMED) != 0)
1491			return (ESTALE);
1492	}
1493	(void) VOP_SETATTR(vp, &va, cred);
1494	(void) nfsvno_getattr(vp, nvap, cred, p, 1);
1495	return (0);
1496}
1497
1498/*
1499 * Glue routine to nfsv4_fillattr().
1500 */
1501int
1502nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp,
1503    struct nfsvattr *nvap, fhandle_t *fhp, int rderror, nfsattrbit_t *attrbitp,
1504    struct ucred *cred, struct thread *p, int isdgram, int reterr,
1505    int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno)
1506{
1507	int error;
1508
1509	error = nfsv4_fillattr(nd, mp, vp, NULL, &nvap->na_vattr, fhp, rderror,
1510	    attrbitp, cred, p, isdgram, reterr, supports_nfsv4acls, at_root,
1511	    mounted_on_fileno);
1512	NFSEXITCODE2(0, nd);
1513	return (error);
1514}
1515
/* Since the Readdir vnode ops vary, put the entire functions in here. */
/*
 * nfs readdir service
 * - mallocs what it thinks is enough to read
 *	count rounded up to a multiple of DIRBLKSIZ <= NFS_MAXREADDIR
 * - calls VOP_READDIR()
 * - loops around building the reply
 *	if the output generated exceeds count break out of loop
 *	The NFSM_CLGET macro is used here so that the reply will be packed
 *	tightly in mbuf clusters.
 * - it trims out records with d_fileno == 0
 *	this doesn't matter for Unix clients, but they might confuse clients
 *	for other os'.
 * - it trims out records with d_type == DT_WHT
 *	these cannot be seen through NFS (unless we extend the protocol)
 *     The alternate call nfsrvd_readdirplus() does lookups as well.
 * PS: The NFS protocol spec. does not clarify what the "count" byte
 *	argument is a count of.. just name strings and file id's or the
 *	entire reply rpc or ...
 *	I tried just file name and id sizes and it confused the Sun client,
 *	so I am using the full rpc size now. The "paranoia.." comment refers
 *	to including the status longwords that are not a part of the dir.
 *	"entry" structures, but are in the rpc.
 *
 * Handles the NFSv2 and NFSv3 request layouts (selected by nd->nd_flag).
 * The protocol status is left in nd->nd_repstat and the function itself
 * returns 0, except on the nfsmout path where "error" is returned.
 * vp is released with vput() on every path that gets past the initial
 * nd->nd_repstat check.  isdgram is not used here.
 */
int
nfsrvd_readdir(struct nfsrv_descript *nd, int isdgram,
    struct vnode *vp, struct thread *p, struct nfsexstuff *exp)
{
	struct dirent *dp;
	u_int32_t *tl;
	int dirlen;
	char *cpos, *cend, *rbuf;
	struct nfsvattr at;
	int nlen, error = 0, getret = 1;
	int siz, cnt, fullsiz, eofflag, ncookies;
	u_int64_t off, toff, verf;
	u_long *cookies = NULL, *cookiep;
	struct uio io;
	struct iovec iv;
	int not_zfs;

	/* The request already failed before getting here. */
	if (nd->nd_repstat) {
		nfsrv_postopattr(nd, getret, &at);
		goto out;
	}
	/*
	 * Parse the arguments: NFSv2 sends a 32 bit offset cookie followed
	 * by the count, NFSv3 a 64 bit cookie, a 64 bit cookie verifier and
	 * then the count.
	 * NOTE(review): NFSM_DISSECT presumably jumps to nfsmout when the
	 * request is too short; the label has no explicit user in this
	 * function -- confirm against the macro.
	 */
	if (nd->nd_flag & ND_NFSV2) {
		NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
		off = fxdr_unsigned(u_quad_t, *tl++);
	} else {
		NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
		off = fxdr_hyper(tl);
		tl += 2;
		verf = fxdr_hyper(tl);
		tl += 2;
	}
	/* toff remembers the offset below which entries are skipped. */
	toff = off;
	cnt = fxdr_unsigned(int, *tl);
	/* Out of range counts are replaced by the server maximum. */
	if (cnt > NFS_SRVMAXDATA(nd) || cnt < 0)
		cnt = NFS_SRVMAXDATA(nd);
	/* Read size: cnt rounded up to a multiple of DIRBLKSIZ. */
	siz = ((cnt + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1));
	fullsiz = siz;
	if (nd->nd_flag & ND_NFSV3) {
		nd->nd_repstat = getret = nfsvno_getattr(vp, &at, nd->nd_cred,
		    p, 1);
#if 0
		/*
		 * va_filerev is not sufficient as a cookie verifier,
		 * since it is not supposed to change when entries are
		 * removed/added unless that offset cookies returned to
		 * the client are no longer valid.
		 */
		if (!nd->nd_repstat && toff && verf != at.na_filerev)
			nd->nd_repstat = NFSERR_BAD_COOKIE;
#endif
	}
	if (!nd->nd_repstat && vp->v_type != VDIR)
		nd->nd_repstat = NFSERR_NOTDIR;
	if (nd->nd_repstat == 0 && cnt == 0) {
		if (nd->nd_flag & ND_NFSV2)
			/* NFSv2 does not have NFSERR_TOOSMALL */
			nd->nd_repstat = EPERM;
		else
			nd->nd_repstat = NFSERR_TOOSMALL;
	}
	if (!nd->nd_repstat)
		nd->nd_repstat = nfsvno_accchk(vp, VEXEC,
		    nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
		    NFSACCCHK_VPISLOCKED, NULL);
	/* Any failure so far: release the vnode and reply with the status. */
	if (nd->nd_repstat) {
		vput(vp);
		if (nd->nd_flag & ND_NFSV3)
			nfsrv_postopattr(nd, getret, &at);
		goto out;
	}
	/* not_zfs is zero only when the file system type name is "zfs". */
	not_zfs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "zfs");
	MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
again:
	/*
	 * (Re)read a chunk of the directory starting at off.  The cookie
	 * array left over from a previous pass is freed first.
	 */
	eofflag = 0;
	if (cookies) {
		free((caddr_t)cookies, M_TEMP);
		cookies = NULL;
	}

	iv.iov_base = rbuf;
	iv.iov_len = siz;
	io.uio_iov = &iv;
	io.uio_iovcnt = 1;
	io.uio_offset = (off_t)off;
	io.uio_resid = siz;
	io.uio_segflg = UIO_SYSSPACE;
	io.uio_rw = UIO_READ;
	io.uio_td = NULL;
	nd->nd_repstat = VOP_READDIR(vp, &io, nd->nd_cred, &eofflag, &ncookies,
	    &cookies);
	/* off advances to where the read stopped; siz becomes bytes read. */
	off = (u_int64_t)io.uio_offset;
	if (io.uio_resid)
		siz -= io.uio_resid;

	/* A successful read that returned no cookie array is an error. */
	if (!cookies && !nd->nd_repstat)
		nd->nd_repstat = NFSERR_PERM;
	if (nd->nd_flag & ND_NFSV3) {
		getret = nfsvno_getattr(vp, &at, nd->nd_cred, p, 1);
		if (!nd->nd_repstat)
			nd->nd_repstat = getret;
	}

	/*
	 * Handles the failed cases. nd->nd_repstat == 0 past here.
	 */
	if (nd->nd_repstat) {
		vput(vp);
		free((caddr_t)rbuf, M_TEMP);
		if (cookies)
			free((caddr_t)cookies, M_TEMP);
		if (nd->nd_flag & ND_NFSV3)
			nfsrv_postopattr(nd, getret, &at);
		goto out;
	}
	/*
	 * If nothing read, return eof
	 * rpc reply
	 */
	if (siz == 0) {
		vput(vp);
		if (nd->nd_flag & ND_NFSV2) {
			NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
		} else {
			nfsrv_postopattr(nd, getret, &at);
			NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
			/* na_filerev is sent as the cookie verifier. */
			txdr_hyper(at.na_filerev, tl);
			tl += 2;
		}
		/* "no entry follows", then "eof". */
		*tl++ = newnfs_false;
		*tl = newnfs_true;
		FREE((caddr_t)rbuf, M_TEMP);
		FREE((caddr_t)cookies, M_TEMP);
		goto out;
	}

	/*
	 * Check for degenerate cases of nothing useful read.
	 * If so go try again
	 */
	cpos = rbuf;
	cend = rbuf + siz;
	dp = (struct dirent *)cpos;
	cookiep = cookies;

	/*
	 * For some reason FreeBSD's ufs_readdir() chooses to back the
	 * directory offset up to a block boundary, so it is necessary to
	 * skip over the records that precede the requested offset. This
	 * requires the assumption that file offset cookies monotonically
	 * increase.
	 * Since the offset cookies don't monotonically increase for ZFS,
	 * this is not done when ZFS is the file system.
	 */
	while (cpos < cend && ncookies > 0 &&
	    (dp->d_fileno == 0 || dp->d_type == DT_WHT ||
	     (not_zfs != 0 && ((u_quad_t)(*cookiep)) <= toff))) {
		cpos += dp->d_reclen;
		dp = (struct dirent *)cpos;
		cookiep++;
		ncookies--;
	}
	/* Everything in this chunk was skipped: read the next chunk. */
	if (cpos >= cend || ncookies == 0) {
		siz = fullsiz;
		toff = off;
		goto again;
	}
	/* The directory vnode is not touched again after this point. */
	vput(vp);

	/*
	 * dirlen is the size of the reply, including all XDR and must
	 * not exceed cnt. For NFSv2, RFC1094 didn't clearly indicate
	 * if the XDR should be included in "count", but to be safe, we do.
	 * (Include the two booleans at the end of the reply in dirlen now.)
	 */
	if (nd->nd_flag & ND_NFSV3) {
		nfsrv_postopattr(nd, getret, &at);
		NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
		txdr_hyper(at.na_filerev, tl);
		dirlen = NFSX_V3POSTOPATTR + NFSX_VERF + 2 * NFSX_UNSIGNED;
	} else {
		dirlen = 2 * NFSX_UNSIGNED;
	}

	/* Loop through the records and build reply */
	while (cpos < cend && ncookies > 0) {
		nlen = dp->d_namlen;
		/* Empty slots, whiteouts and over-long names are left out. */
		if (dp->d_fileno != 0 && dp->d_type != DT_WHT &&
			nlen <= NFS_MAXNAMLEN) {
			if (nd->nd_flag & ND_NFSV3)
				dirlen += (6*NFSX_UNSIGNED + NFSM_RNDUP(nlen));
			else
				dirlen += (4*NFSX_UNSIGNED + NFSM_RNDUP(nlen));
			/* This entry would not fit: stop, and it is not eof. */
			if (dirlen > cnt) {
				eofflag = 0;
				break;
			}

			/*
			 * Build the directory record xdr from
			 * the dirent entry.
			 */
			/*
			 * NFSv3 uses 64 bit file ids and cookies; the upper
			 * 32 bits are sent as 0 here.
			 */
			if (nd->nd_flag & ND_NFSV3) {
				NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
				*tl++ = newnfs_true;
				*tl++ = 0;
			} else {
				NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
				*tl++ = newnfs_true;
			}
			*tl = txdr_unsigned(dp->d_fileno);
			(void) nfsm_strtom(nd, dp->d_name, nlen);
			if (nd->nd_flag & ND_NFSV3) {
				NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
				*tl++ = 0;
			} else
				NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
			*tl = txdr_unsigned(*cookiep);
		}
		cpos += dp->d_reclen;
		dp = (struct dirent *)cpos;
		cookiep++;
		ncookies--;
	}
	/* Records left unprocessed in the buffer mean this is not eof. */
	if (cpos < cend)
		eofflag = 0;
	/* Trailer: "no more entries in this reply", then the eof flag. */
	NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
	*tl++ = newnfs_false;
	if (eofflag)
		*tl = newnfs_true;
	else
		*tl = newnfs_false;
	FREE((caddr_t)rbuf, M_TEMP);
	FREE((caddr_t)cookies, M_TEMP);

out:
	NFSEXITCODE2(0, nd);
	return (0);
nfsmout:
	vput(vp);
	NFSEXITCODE2(error, nd);
	return (error);
}
1782
1783/*
1784 * Readdirplus for V3 and Readdir for V4.
1785 */
1786int
1787nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram,
1788    struct vnode *vp, struct thread *p, struct nfsexstuff *exp)
1789{
1790	struct dirent *dp;
1791	u_int32_t *tl;
1792	int dirlen;
1793	char *cpos, *cend, *rbuf;
1794	struct vnode *nvp;
1795	fhandle_t nfh;
1796	struct nfsvattr nva, at, *nvap = &nva;
1797	struct mbuf *mb0, *mb1;
1798	struct nfsreferral *refp;
1799	int nlen, r, error = 0, getret = 1, usevget = 1;
1800	int siz, cnt, fullsiz, eofflag, ncookies, entrycnt;
1801	caddr_t bpos0, bpos1;
1802	u_int64_t off, toff, verf;
1803	u_long *cookies = NULL, *cookiep;
1804	nfsattrbit_t attrbits, rderrbits, savbits;
1805	struct uio io;
1806	struct iovec iv;
1807	struct componentname cn;
1808	int at_root, needs_unbusy, not_zfs, supports_nfsv4acls;
1809	struct mount *mp, *new_mp;
1810	uint64_t mounted_on_fileno;
1811
1812	if (nd->nd_repstat) {
1813		nfsrv_postopattr(nd, getret, &at);
1814		goto out;
1815	}
1816	NFSM_DISSECT(tl, u_int32_t *, 6 * NFSX_UNSIGNED);
1817	off = fxdr_hyper(tl);
1818	toff = off;
1819	tl += 2;
1820	verf = fxdr_hyper(tl);
1821	tl += 2;
1822	siz = fxdr_unsigned(int, *tl++);
1823	cnt = fxdr_unsigned(int, *tl);
1824
1825	/*
1826	 * Use the server's maximum data transfer size as the upper bound
1827	 * on reply datalen.
1828	 */
1829	if (cnt > NFS_SRVMAXDATA(nd) || cnt < 0)
1830		cnt = NFS_SRVMAXDATA(nd);
1831
1832	/*
1833	 * siz is a "hint" of how much directory information (name, fileid,
1834	 * cookie) should be in the reply. At least one client "hints" 0,
1835	 * so I set it to cnt for that case. I also round it up to the
1836	 * next multiple of DIRBLKSIZ.
1837	 */
1838	if (siz <= 0)
1839		siz = cnt;
1840	siz = ((siz + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1));
1841
1842	if (nd->nd_flag & ND_NFSV4) {
1843		error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
1844		if (error)
1845			goto nfsmout;
1846		NFSSET_ATTRBIT(&savbits, &attrbits);
1847		NFSCLRNOTFILLABLE_ATTRBIT(&attrbits);
1848		NFSZERO_ATTRBIT(&rderrbits);
1849		NFSSETBIT_ATTRBIT(&rderrbits, NFSATTRBIT_RDATTRERROR);
1850	} else {
1851		NFSZERO_ATTRBIT(&attrbits);
1852	}
1853	fullsiz = siz;
1854	nd->nd_repstat = getret = nfsvno_getattr(vp, &at, nd->nd_cred, p, 1);
1855	if (!nd->nd_repstat) {
1856	    if (off && verf != at.na_filerev) {
1857		/*
1858		 * va_filerev is not sufficient as a cookie verifier,
1859		 * since it is not supposed to change when entries are
1860		 * removed/added unless that offset cookies returned to
1861		 * the client are no longer valid.
1862		 */
1863#if 0
1864		if (nd->nd_flag & ND_NFSV4) {
1865			nd->nd_repstat = NFSERR_NOTSAME;
1866		} else {
1867			nd->nd_repstat = NFSERR_BAD_COOKIE;
1868		}
1869#endif
1870	    } else if ((nd->nd_flag & ND_NFSV4) && off == 0 && verf != 0) {
1871		nd->nd_repstat = NFSERR_BAD_COOKIE;
1872	    }
1873	}
1874	if (!nd->nd_repstat && vp->v_type != VDIR)
1875		nd->nd_repstat = NFSERR_NOTDIR;
1876	if (!nd->nd_repstat && cnt == 0)
1877		nd->nd_repstat = NFSERR_TOOSMALL;
1878	if (!nd->nd_repstat)
1879		nd->nd_repstat = nfsvno_accchk(vp, VEXEC,
1880		    nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
1881		    NFSACCCHK_VPISLOCKED, NULL);
1882	if (nd->nd_repstat) {
1883		vput(vp);
1884		if (nd->nd_flag & ND_NFSV3)
1885			nfsrv_postopattr(nd, getret, &at);
1886		goto out;
1887	}
1888	not_zfs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "zfs");
1889
1890	MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
1891again:
1892	eofflag = 0;
1893	if (cookies) {
1894		free((caddr_t)cookies, M_TEMP);
1895		cookies = NULL;
1896	}
1897
1898	iv.iov_base = rbuf;
1899	iv.iov_len = siz;
1900	io.uio_iov = &iv;
1901	io.uio_iovcnt = 1;
1902	io.uio_offset = (off_t)off;
1903	io.uio_resid = siz;
1904	io.uio_segflg = UIO_SYSSPACE;
1905	io.uio_rw = UIO_READ;
1906	io.uio_td = NULL;
1907	nd->nd_repstat = VOP_READDIR(vp, &io, nd->nd_cred, &eofflag, &ncookies,
1908	    &cookies);
1909	off = (u_int64_t)io.uio_offset;
1910	if (io.uio_resid)
1911		siz -= io.uio_resid;
1912
1913	getret = nfsvno_getattr(vp, &at, nd->nd_cred, p, 1);
1914
1915	if (!cookies && !nd->nd_repstat)
1916		nd->nd_repstat = NFSERR_PERM;
1917	if (!nd->nd_repstat)
1918		nd->nd_repstat = getret;
1919	if (nd->nd_repstat) {
1920		vput(vp);
1921		if (cookies)
1922			free((caddr_t)cookies, M_TEMP);
1923		free((caddr_t)rbuf, M_TEMP);
1924		if (nd->nd_flag & ND_NFSV3)
1925			nfsrv_postopattr(nd, getret, &at);
1926		goto out;
1927	}
1928	/*
1929	 * If nothing read, return eof
1930	 * rpc reply
1931	 */
1932	if (siz == 0) {
1933		vput(vp);
1934		if (nd->nd_flag & ND_NFSV3)
1935			nfsrv_postopattr(nd, getret, &at);
1936		NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
1937		txdr_hyper(at.na_filerev, tl);
1938		tl += 2;
1939		*tl++ = newnfs_false;
1940		*tl = newnfs_true;
1941		free((caddr_t)cookies, M_TEMP);
1942		free((caddr_t)rbuf, M_TEMP);
1943		goto out;
1944	}
1945
1946	/*
1947	 * Check for degenerate cases of nothing useful read.
1948	 * If so go try again
1949	 */
1950	cpos = rbuf;
1951	cend = rbuf + siz;
1952	dp = (struct dirent *)cpos;
1953	cookiep = cookies;
1954
1955	/*
1956	 * For some reason FreeBSD's ufs_readdir() chooses to back the
1957	 * directory offset up to a block boundary, so it is necessary to
1958	 * skip over the records that precede the requested offset. This
1959	 * requires the assumption that file offset cookies monotonically
1960	 * increase.
1961	 * Since the offset cookies don't monotonically increase for ZFS,
1962	 * this is not done when ZFS is the file system.
1963	 */
1964	while (cpos < cend && ncookies > 0 &&
1965	  (dp->d_fileno == 0 || dp->d_type == DT_WHT ||
1966	   (not_zfs != 0 && ((u_quad_t)(*cookiep)) <= toff) ||
1967	   ((nd->nd_flag & ND_NFSV4) &&
1968	    ((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
1969	     (dp->d_namlen==2 && dp->d_name[0]=='.' && dp->d_name[1]=='.'))))) {
1970		cpos += dp->d_reclen;
1971		dp = (struct dirent *)cpos;
1972		cookiep++;
1973		ncookies--;
1974	}
1975	if (cpos >= cend || ncookies == 0) {
1976		siz = fullsiz;
1977		toff = off;
1978		goto again;
1979	}
1980
1981	/*
1982	 * Busy the file system so that the mount point won't go away
1983	 * and, as such, VFS_VGET() can be used safely.
1984	 */
1985	mp = vp->v_mount;
1986	vfs_ref(mp);
1987	NFSVOPUNLOCK(vp, 0);
1988	nd->nd_repstat = vfs_busy(mp, 0);
1989	vfs_rel(mp);
1990	if (nd->nd_repstat != 0) {
1991		vrele(vp);
1992		free(cookies, M_TEMP);
1993		free(rbuf, M_TEMP);
1994		if (nd->nd_flag & ND_NFSV3)
1995			nfsrv_postopattr(nd, getret, &at);
1996		goto out;
1997	}
1998
1999	/*
2000	 * Check to see if entries in this directory can be safely acquired
2001	 * via VFS_VGET() or if a switch to VOP_LOOKUP() is required.
2002	 * ZFS snapshot directories need VOP_LOOKUP(), so that any
2003	 * automount of the snapshot directory that is required will
2004	 * be done.
2005	 * This needs to be done here for NFSv4, since NFSv4 never does
2006	 * a VFS_VGET() for "." or "..".
2007	 */
2008	if (not_zfs == 0) {
2009		r = VFS_VGET(mp, at.na_fileid, LK_SHARED, &nvp);
2010		if (r == EOPNOTSUPP) {
2011			usevget = 0;
2012			cn.cn_nameiop = LOOKUP;
2013			cn.cn_lkflags = LK_SHARED | LK_RETRY;
2014			cn.cn_cred = nd->nd_cred;
2015			cn.cn_thread = p;
2016		} else if (r == 0)
2017			vput(nvp);
2018	}
2019
2020	/*
2021	 * Save this position, in case there is an error before one entry
2022	 * is created.
2023	 */
2024	mb0 = nd->nd_mb;
2025	bpos0 = nd->nd_bpos;
2026
2027	/*
2028	 * Fill in the first part of the reply.
2029	 * dirlen is the reply length in bytes and cannot exceed cnt.
2030	 * (Include the two booleans at the end of the reply in dirlen now,
2031	 *  so we recognize when we have exceeded cnt.)
2032	 */
2033	if (nd->nd_flag & ND_NFSV3) {
2034		dirlen = NFSX_V3POSTOPATTR + NFSX_VERF + 2 * NFSX_UNSIGNED;
2035		nfsrv_postopattr(nd, getret, &at);
2036	} else {
2037		dirlen = NFSX_VERF + 2 * NFSX_UNSIGNED;
2038	}
2039	NFSM_BUILD(tl, u_int32_t *, NFSX_VERF);
2040	txdr_hyper(at.na_filerev, tl);
2041
2042	/*
2043	 * Save this position, in case there is an empty reply needed.
2044	 */
2045	mb1 = nd->nd_mb;
2046	bpos1 = nd->nd_bpos;
2047
2048	/* Loop through the records and build reply */
2049	entrycnt = 0;
2050	while (cpos < cend && ncookies > 0 && dirlen < cnt) {
2051		nlen = dp->d_namlen;
2052		if (dp->d_fileno != 0 && dp->d_type != DT_WHT &&
2053		    nlen <= NFS_MAXNAMLEN &&
2054		    ((nd->nd_flag & ND_NFSV3) || nlen > 2 ||
2055		     (nlen==2 && (dp->d_name[0]!='.' || dp->d_name[1]!='.'))
2056		      || (nlen == 1 && dp->d_name[0] != '.'))) {
2057			/*
2058			 * Save the current position in the reply, in case
2059			 * this entry exceeds cnt.
2060			 */
2061			mb1 = nd->nd_mb;
2062			bpos1 = nd->nd_bpos;
2063
2064			/*
2065			 * For readdir_and_lookup get the vnode using
2066			 * the file number.
2067			 */
2068			nvp = NULL;
2069			refp = NULL;
2070			r = 0;
2071			at_root = 0;
2072			needs_unbusy = 0;
2073			new_mp = mp;
2074			mounted_on_fileno = (uint64_t)dp->d_fileno;
2075			if ((nd->nd_flag & ND_NFSV3) ||
2076			    NFSNONZERO_ATTRBIT(&savbits)) {
2077				if (nd->nd_flag & ND_NFSV4)
2078					refp = nfsv4root_getreferral(NULL,
2079					    vp, dp->d_fileno);
2080				if (refp == NULL) {
2081					if (usevget)
2082						r = VFS_VGET(mp, dp->d_fileno,
2083						    LK_SHARED, &nvp);
2084					else
2085						r = EOPNOTSUPP;
2086					if (r == EOPNOTSUPP) {
2087						if (usevget) {
2088							usevget = 0;
2089							cn.cn_nameiop = LOOKUP;
2090							cn.cn_lkflags =
2091							    LK_SHARED |
2092							    LK_RETRY;
2093							cn.cn_cred =
2094							    nd->nd_cred;
2095							cn.cn_thread = p;
2096						}
2097						cn.cn_nameptr = dp->d_name;
2098						cn.cn_namelen = nlen;
2099						cn.cn_flags = ISLASTCN |
2100						    NOFOLLOW | LOCKLEAF |
2101						    MPSAFE;
2102						if (nlen == 2 &&
2103						    dp->d_name[0] == '.' &&
2104						    dp->d_name[1] == '.')
2105							cn.cn_flags |=
2106							    ISDOTDOT;
2107						if (NFSVOPLOCK(vp, LK_SHARED)
2108						    != 0) {
2109							nd->nd_repstat = EPERM;
2110							break;
2111						}
2112						if ((vp->v_vflag & VV_ROOT) != 0
2113						    && (cn.cn_flags & ISDOTDOT)
2114						    != 0) {
2115							vref(vp);
2116							nvp = vp;
2117							r = 0;
2118						} else {
2119							r = VOP_LOOKUP(vp, &nvp,
2120							    &cn);
2121							if (vp != nvp)
2122								NFSVOPUNLOCK(vp,
2123								    0);
2124						}
2125					}
2126
2127					/*
2128					 * For NFSv4, check to see if nvp is
2129					 * a mount point and get the mount
2130					 * point vnode, as required.
2131					 */
2132					if (r == 0 &&
2133					    nfsrv_enable_crossmntpt != 0 &&
2134					    (nd->nd_flag & ND_NFSV4) != 0 &&
2135					    nvp->v_type == VDIR &&
2136					    nvp->v_mountedhere != NULL) {
2137						new_mp = nvp->v_mountedhere;
2138						r = vfs_busy(new_mp, 0);
2139						vput(nvp);
2140						nvp = NULL;
2141						if (r == 0) {
2142							r = VFS_ROOT(new_mp,
2143							    LK_SHARED, &nvp);
2144							needs_unbusy = 1;
2145							if (r == 0)
2146								at_root = 1;
2147						}
2148					}
2149				}
2150				if (!r) {
2151				    if (refp == NULL &&
2152					((nd->nd_flag & ND_NFSV3) ||
2153					 NFSNONZERO_ATTRBIT(&attrbits))) {
2154					r = nfsvno_getfh(nvp, &nfh, p);
2155					if (!r)
2156					    r = nfsvno_getattr(nvp, nvap,
2157						nd->nd_cred, p, 1);
2158					if (r == 0 && not_zfs == 0 &&
2159					    nfsrv_enable_crossmntpt != 0 &&
2160					    (nd->nd_flag & ND_NFSV4) != 0 &&
2161					    nvp->v_type == VDIR &&
2162					    vp->v_mount != nvp->v_mount) {
2163					    /*
2164					     * For a ZFS snapshot, there is a
2165					     * pseudo mount that does not set
2166					     * v_mountedhere, so it needs to
2167					     * be detected via a different
2168					     * mount structure.
2169					     */
2170					    at_root = 1;
2171					    if (new_mp == mp)
2172						new_mp = nvp->v_mount;
2173					}
2174				    }
2175				} else {
2176				    nvp = NULL;
2177				}
2178				if (r) {
2179					if (!NFSISSET_ATTRBIT(&attrbits,
2180					    NFSATTRBIT_RDATTRERROR)) {
2181						if (nvp != NULL)
2182							vput(nvp);
2183						if (needs_unbusy != 0)
2184							vfs_unbusy(new_mp);
2185						nd->nd_repstat = r;
2186						break;
2187					}
2188				}
2189			}
2190
2191			/*
2192			 * Build the directory record xdr
2193			 */
2194			if (nd->nd_flag & ND_NFSV3) {
2195				NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
2196				*tl++ = newnfs_true;
2197				*tl++ = 0;
2198				*tl = txdr_unsigned(dp->d_fileno);
2199				dirlen += nfsm_strtom(nd, dp->d_name, nlen);
2200				NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
2201				*tl++ = 0;
2202				*tl = txdr_unsigned(*cookiep);
2203				nfsrv_postopattr(nd, 0, nvap);
2204				dirlen += nfsm_fhtom(nd,(u_int8_t *)&nfh,0,1);
2205				dirlen += (5*NFSX_UNSIGNED+NFSX_V3POSTOPATTR);
2206				if (nvp != NULL)
2207					vput(nvp);
2208			} else {
2209				NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
2210				*tl++ = newnfs_true;
2211				*tl++ = 0;
2212				*tl = txdr_unsigned(*cookiep);
2213				dirlen += nfsm_strtom(nd, dp->d_name, nlen);
2214				if (nvp != NULL) {
2215					supports_nfsv4acls =
2216					    nfs_supportsnfsv4acls(nvp);
2217					NFSVOPUNLOCK(nvp, 0);
2218				} else
2219					supports_nfsv4acls = 0;
2220				if (refp != NULL) {
2221					dirlen += nfsrv_putreferralattr(nd,
2222					    &savbits, refp, 0,
2223					    &nd->nd_repstat);
2224					if (nd->nd_repstat) {
2225						if (nvp != NULL)
2226							vrele(nvp);
2227						if (needs_unbusy != 0)
2228							vfs_unbusy(new_mp);
2229						break;
2230					}
2231				} else if (r) {
2232					dirlen += nfsvno_fillattr(nd, new_mp,
2233					    nvp, nvap, &nfh, r, &rderrbits,
2234					    nd->nd_cred, p, isdgram, 0,
2235					    supports_nfsv4acls, at_root,
2236					    mounted_on_fileno);
2237				} else {
2238					dirlen += nfsvno_fillattr(nd, new_mp,
2239					    nvp, nvap, &nfh, r, &attrbits,
2240					    nd->nd_cred, p, isdgram, 0,
2241					    supports_nfsv4acls, at_root,
2242					    mounted_on_fileno);
2243				}
2244				if (nvp != NULL)
2245					vrele(nvp);
2246				dirlen += (3 * NFSX_UNSIGNED);
2247			}
2248			if (needs_unbusy != 0)
2249				vfs_unbusy(new_mp);
2250			if (dirlen <= cnt)
2251				entrycnt++;
2252		}
2253		cpos += dp->d_reclen;
2254		dp = (struct dirent *)cpos;
2255		cookiep++;
2256		ncookies--;
2257	}
2258	vrele(vp);
2259	vfs_unbusy(mp);
2260
2261	/*
2262	 * If dirlen > cnt, we must strip off the last entry. If that
2263	 * results in an empty reply, report NFSERR_TOOSMALL.
2264	 */
2265	if (dirlen > cnt || nd->nd_repstat) {
2266		if (!nd->nd_repstat && entrycnt == 0)
2267			nd->nd_repstat = NFSERR_TOOSMALL;
2268		if (nd->nd_repstat)
2269			newnfs_trimtrailing(nd, mb0, bpos0);
2270		else
2271			newnfs_trimtrailing(nd, mb1, bpos1);
2272		eofflag = 0;
2273	} else if (cpos < cend)
2274		eofflag = 0;
2275	if (!nd->nd_repstat) {
2276		NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
2277		*tl++ = newnfs_false;
2278		if (eofflag)
2279			*tl = newnfs_true;
2280		else
2281			*tl = newnfs_false;
2282	}
2283	FREE((caddr_t)cookies, M_TEMP);
2284	FREE((caddr_t)rbuf, M_TEMP);
2285
2286out:
2287	NFSEXITCODE2(0, nd);
2288	return (0);
2289nfsmout:
2290	vput(vp);
2291	NFSEXITCODE2(error, nd);
2292	return (error);
2293}
2294
2295/*
2296 * Get the settable attributes out of the mbuf list.
2297 * (Return 0 or EBADRPC)
2298 */
2299int
2300nfsrv_sattr(struct nfsrv_descript *nd, struct nfsvattr *nvap,
2301    nfsattrbit_t *attrbitp, NFSACL_T *aclp, struct thread *p)
2302{
2303	u_int32_t *tl;
2304	struct nfsv2_sattr *sp;
2305	int error = 0, toclient = 0;
2306
2307	switch (nd->nd_flag & (ND_NFSV2 | ND_NFSV3 | ND_NFSV4)) {
2308	case ND_NFSV2:
2309		NFSM_DISSECT(sp, struct nfsv2_sattr *, NFSX_V2SATTR);
2310		/*
2311		 * Some old clients didn't fill in the high order 16bits.
2312		 * --> check the low order 2 bytes for 0xffff
2313		 */
2314		if ((fxdr_unsigned(int, sp->sa_mode) & 0xffff) != 0xffff)
2315			nvap->na_mode = nfstov_mode(sp->sa_mode);
2316		if (sp->sa_uid != newnfs_xdrneg1)
2317			nvap->na_uid = fxdr_unsigned(uid_t, sp->sa_uid);
2318		if (sp->sa_gid != newnfs_xdrneg1)
2319			nvap->na_gid = fxdr_unsigned(gid_t, sp->sa_gid);
2320		if (sp->sa_size != newnfs_xdrneg1)
2321			nvap->na_size = fxdr_unsigned(u_quad_t, sp->sa_size);
2322		if (sp->sa_atime.nfsv2_sec != newnfs_xdrneg1) {
2323#ifdef notyet
2324			fxdr_nfsv2time(&sp->sa_atime, &nvap->na_atime);
2325#else
2326			nvap->na_atime.tv_sec =
2327				fxdr_unsigned(u_int32_t,sp->sa_atime.nfsv2_sec);
2328			nvap->na_atime.tv_nsec = 0;
2329#endif
2330		}
2331		if (sp->sa_mtime.nfsv2_sec != newnfs_xdrneg1)
2332			fxdr_nfsv2time(&sp->sa_mtime, &nvap->na_mtime);
2333		break;
2334	case ND_NFSV3:
2335		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2336		if (*tl == newnfs_true) {
2337			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2338			nvap->na_mode = nfstov_mode(*tl);
2339		}
2340		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2341		if (*tl == newnfs_true) {
2342			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2343			nvap->na_uid = fxdr_unsigned(uid_t, *tl);
2344		}
2345		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2346		if (*tl == newnfs_true) {
2347			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2348			nvap->na_gid = fxdr_unsigned(gid_t, *tl);
2349		}
2350		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2351		if (*tl == newnfs_true) {
2352			NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
2353			nvap->na_size = fxdr_hyper(tl);
2354		}
2355		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2356		switch (fxdr_unsigned(int, *tl)) {
2357		case NFSV3SATTRTIME_TOCLIENT:
2358			NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
2359			fxdr_nfsv3time(tl, &nvap->na_atime);
2360			toclient = 1;
2361			break;
2362		case NFSV3SATTRTIME_TOSERVER:
2363			vfs_timestamp(&nvap->na_atime);
2364			nvap->na_vaflags |= VA_UTIMES_NULL;
2365			break;
2366		};
2367		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2368		switch (fxdr_unsigned(int, *tl)) {
2369		case NFSV3SATTRTIME_TOCLIENT:
2370			NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
2371			fxdr_nfsv3time(tl, &nvap->na_mtime);
2372			nvap->na_vaflags &= ~VA_UTIMES_NULL;
2373			break;
2374		case NFSV3SATTRTIME_TOSERVER:
2375			vfs_timestamp(&nvap->na_mtime);
2376			if (!toclient)
2377				nvap->na_vaflags |= VA_UTIMES_NULL;
2378			break;
2379		};
2380		break;
2381	case ND_NFSV4:
2382		error = nfsv4_sattr(nd, nvap, attrbitp, aclp, p);
2383	};
2384nfsmout:
2385	NFSEXITCODE2(error, nd);
2386	return (error);
2387}
2388
2389/*
2390 * Handle the setable attributes for V4.
2391 * Returns NFSERR_BADXDR if it can't be parsed, 0 otherwise.
2392 */
2393int
2394nfsv4_sattr(struct nfsrv_descript *nd, struct nfsvattr *nvap,
2395    nfsattrbit_t *attrbitp, NFSACL_T *aclp, struct thread *p)
2396{
2397	u_int32_t *tl;
2398	int attrsum = 0;
2399	int i, j;
2400	int error, attrsize, bitpos, aclsize, aceerr, retnotsup = 0;
2401	int toclient = 0;
2402	u_char *cp, namestr[NFSV4_SMALLSTR + 1];
2403	uid_t uid;
2404	gid_t gid;
2405
2406	error = nfsrv_getattrbits(nd, attrbitp, NULL, &retnotsup);
2407	if (error)
2408		goto nfsmout;
2409	NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2410	attrsize = fxdr_unsigned(int, *tl);
2411
2412	/*
2413	 * Loop around getting the setable attributes. If an unsupported
2414	 * one is found, set nd_repstat == NFSERR_ATTRNOTSUPP and return.
2415	 */
2416	if (retnotsup) {
2417		nd->nd_repstat = NFSERR_ATTRNOTSUPP;
2418		bitpos = NFSATTRBIT_MAX;
2419	} else {
2420		bitpos = 0;
2421	}
2422	for (; bitpos < NFSATTRBIT_MAX; bitpos++) {
2423	    if (attrsum > attrsize) {
2424		error = NFSERR_BADXDR;
2425		goto nfsmout;
2426	    }
2427	    if (NFSISSET_ATTRBIT(attrbitp, bitpos))
2428		switch (bitpos) {
2429		case NFSATTRBIT_SIZE:
2430			NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
2431			nvap->na_size = fxdr_hyper(tl);
2432			attrsum += NFSX_HYPER;
2433			break;
2434		case NFSATTRBIT_ACL:
2435			error = nfsrv_dissectacl(nd, aclp, &aceerr, &aclsize,
2436			    p);
2437			if (error)
2438				goto nfsmout;
2439			if (aceerr && !nd->nd_repstat)
2440				nd->nd_repstat = aceerr;
2441			attrsum += aclsize;
2442			break;
2443		case NFSATTRBIT_ARCHIVE:
2444			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2445			if (!nd->nd_repstat)
2446				nd->nd_repstat = NFSERR_ATTRNOTSUPP;
2447			attrsum += NFSX_UNSIGNED;
2448			break;
2449		case NFSATTRBIT_HIDDEN:
2450			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2451			if (!nd->nd_repstat)
2452				nd->nd_repstat = NFSERR_ATTRNOTSUPP;
2453			attrsum += NFSX_UNSIGNED;
2454			break;
2455		case NFSATTRBIT_MIMETYPE:
2456			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2457			i = fxdr_unsigned(int, *tl);
2458			error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
2459			if (error)
2460				goto nfsmout;
2461			if (!nd->nd_repstat)
2462				nd->nd_repstat = NFSERR_ATTRNOTSUPP;
2463			attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(i));
2464			break;
2465		case NFSATTRBIT_MODE:
2466			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2467			nvap->na_mode = nfstov_mode(*tl);
2468			attrsum += NFSX_UNSIGNED;
2469			break;
2470		case NFSATTRBIT_OWNER:
2471			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2472			j = fxdr_unsigned(int, *tl);
2473			if (j < 0) {
2474				error = NFSERR_BADXDR;
2475				goto nfsmout;
2476			}
2477			if (j > NFSV4_SMALLSTR)
2478				cp = malloc(j + 1, M_NFSSTRING, M_WAITOK);
2479			else
2480				cp = namestr;
2481			error = nfsrv_mtostr(nd, cp, j);
2482			if (error) {
2483				if (j > NFSV4_SMALLSTR)
2484					free(cp, M_NFSSTRING);
2485				goto nfsmout;
2486			}
2487			if (!nd->nd_repstat) {
2488				nd->nd_repstat = nfsv4_strtouid(nd, cp, j, &uid,
2489				    p);
2490				if (!nd->nd_repstat)
2491					nvap->na_uid = uid;
2492			}
2493			if (j > NFSV4_SMALLSTR)
2494				free(cp, M_NFSSTRING);
2495			attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j));
2496			break;
2497		case NFSATTRBIT_OWNERGROUP:
2498			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2499			j = fxdr_unsigned(int, *tl);
2500			if (j < 0) {
2501				error = NFSERR_BADXDR;
2502				goto nfsmout;
2503			}
2504			if (j > NFSV4_SMALLSTR)
2505				cp = malloc(j + 1, M_NFSSTRING, M_WAITOK);
2506			else
2507				cp = namestr;
2508			error = nfsrv_mtostr(nd, cp, j);
2509			if (error) {
2510				if (j > NFSV4_SMALLSTR)
2511					free(cp, M_NFSSTRING);
2512				goto nfsmout;
2513			}
2514			if (!nd->nd_repstat) {
2515				nd->nd_repstat = nfsv4_strtogid(nd, cp, j, &gid,
2516				    p);
2517				if (!nd->nd_repstat)
2518					nvap->na_gid = gid;
2519			}
2520			if (j > NFSV4_SMALLSTR)
2521				free(cp, M_NFSSTRING);
2522			attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j));
2523			break;
2524		case NFSATTRBIT_SYSTEM:
2525			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2526			if (!nd->nd_repstat)
2527				nd->nd_repstat = NFSERR_ATTRNOTSUPP;
2528			attrsum += NFSX_UNSIGNED;
2529			break;
2530		case NFSATTRBIT_TIMEACCESSSET:
2531			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2532			attrsum += NFSX_UNSIGNED;
2533			if (fxdr_unsigned(int, *tl)==NFSV4SATTRTIME_TOCLIENT) {
2534			    NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
2535			    fxdr_nfsv4time(tl, &nvap->na_atime);
2536			    toclient = 1;
2537			    attrsum += NFSX_V4TIME;
2538			} else {
2539			    vfs_timestamp(&nvap->na_atime);
2540			    nvap->na_vaflags |= VA_UTIMES_NULL;
2541			}
2542			break;
2543		case NFSATTRBIT_TIMEBACKUP:
2544			NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
2545			if (!nd->nd_repstat)
2546				nd->nd_repstat = NFSERR_ATTRNOTSUPP;
2547			attrsum += NFSX_V4TIME;
2548			break;
2549		case NFSATTRBIT_TIMECREATE:
2550			NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
2551			if (!nd->nd_repstat)
2552				nd->nd_repstat = NFSERR_ATTRNOTSUPP;
2553			attrsum += NFSX_V4TIME;
2554			break;
2555		case NFSATTRBIT_TIMEMODIFYSET:
2556			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2557			attrsum += NFSX_UNSIGNED;
2558			if (fxdr_unsigned(int, *tl)==NFSV4SATTRTIME_TOCLIENT) {
2559			    NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
2560			    fxdr_nfsv4time(tl, &nvap->na_mtime);
2561			    nvap->na_vaflags &= ~VA_UTIMES_NULL;
2562			    attrsum += NFSX_V4TIME;
2563			} else {
2564			    vfs_timestamp(&nvap->na_mtime);
2565			    if (!toclient)
2566				nvap->na_vaflags |= VA_UTIMES_NULL;
2567			}
2568			break;
2569		default:
2570			nd->nd_repstat = NFSERR_ATTRNOTSUPP;
2571			/*
2572			 * set bitpos so we drop out of the loop.
2573			 */
2574			bitpos = NFSATTRBIT_MAX;
2575			break;
2576		};
2577	}
2578
2579	/*
2580	 * some clients pad the attrlist, so we need to skip over the
2581	 * padding.
2582	 */
2583	if (attrsum > attrsize) {
2584		error = NFSERR_BADXDR;
2585	} else {
2586		attrsize = NFSM_RNDUP(attrsize);
2587		if (attrsum < attrsize)
2588			error = nfsm_advance(nd, attrsize - attrsum, -1);
2589	}
2590nfsmout:
2591	NFSEXITCODE2(error, nd);
2592	return (error);
2593}
2594
2595/*
2596 * Check/setup export credentials.
2597 */
2598int
2599nfsd_excred(struct nfsrv_descript *nd, struct nfsexstuff *exp,
2600    struct ucred *credanon)
2601{
2602	int error = 0;
2603
2604	/*
2605	 * Check/setup credentials.
2606	 */
2607	if (nd->nd_flag & ND_GSS)
2608		exp->nes_exflag &= ~MNT_EXPORTANON;
2609
2610	/*
2611	 * Check to see if the operation is allowed for this security flavor.
2612	 * RFC2623 suggests that the NFSv3 Fsinfo RPC be allowed to
2613	 * AUTH_NONE or AUTH_SYS for file systems requiring RPCSEC_GSS.
2614	 * Also, allow Secinfo, so that it can acquire the correct flavor(s).
2615	 */
2616	if (nfsvno_testexp(nd, exp) &&
2617	    nd->nd_procnum != NFSV4OP_SECINFO &&
2618	    nd->nd_procnum != NFSPROC_FSINFO) {
2619		if (nd->nd_flag & ND_NFSV4)
2620			error = NFSERR_WRONGSEC;
2621		else
2622			error = (NFSERR_AUTHERR | AUTH_TOOWEAK);
2623		goto out;
2624	}
2625
2626	/*
2627	 * Check to see if the file system is exported V4 only.
2628	 */
2629	if (NFSVNO_EXV4ONLY(exp) && !(nd->nd_flag & ND_NFSV4)) {
2630		error = NFSERR_PROGNOTV4;
2631		goto out;
2632	}
2633
2634	/*
2635	 * Now, map the user credentials.
2636	 * (Note that ND_AUTHNONE will only be set for an NFSv3
2637	 *  Fsinfo RPC. If set for anything else, this code might need
2638	 *  to change.)
2639	 */
2640	if (NFSVNO_EXPORTED(exp) &&
2641	    ((!(nd->nd_flag & ND_GSS) && nd->nd_cred->cr_uid == 0) ||
2642	     NFSVNO_EXPORTANON(exp) ||
2643	     (nd->nd_flag & ND_AUTHNONE))) {
2644		nd->nd_cred->cr_uid = credanon->cr_uid;
2645		nd->nd_cred->cr_gid = credanon->cr_gid;
2646		crsetgroups(nd->nd_cred, credanon->cr_ngroups,
2647		    credanon->cr_groups);
2648	}
2649
2650out:
2651	NFSEXITCODE2(error, nd);
2652	return (error);
2653}
2654
2655/*
2656 * Check exports.
2657 */
2658int
2659nfsvno_checkexp(struct mount *mp, struct sockaddr *nam, struct nfsexstuff *exp,
2660    struct ucred **credp)
2661{
2662	int i, error, *secflavors;
2663
2664	error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp,
2665	    &exp->nes_numsecflavor, &secflavors);
2666	if (error) {
2667		if (nfs_rootfhset) {
2668			exp->nes_exflag = 0;
2669			exp->nes_numsecflavor = 0;
2670			error = 0;
2671		}
2672	} else {
2673		/* Copy the security flavors. */
2674		for (i = 0; i < exp->nes_numsecflavor; i++)
2675			exp->nes_secflavors[i] = secflavors[i];
2676	}
2677	NFSEXITCODE(error);
2678	return (error);
2679}
2680
2681/*
2682 * Get a vnode for a file handle and export stuff.
2683 */
2684int
2685nfsvno_fhtovp(struct mount *mp, fhandle_t *fhp, struct sockaddr *nam,
2686    int lktype, struct vnode **vpp, struct nfsexstuff *exp,
2687    struct ucred **credp)
2688{
2689	int i, error, *secflavors;
2690
2691	*credp = NULL;
2692	exp->nes_numsecflavor = 0;
2693	if (VFS_NEEDSGIANT(mp))
2694		error = ESTALE;
2695	else
2696		error = VFS_FHTOVP(mp, &fhp->fh_fid, lktype, vpp);
2697	if (error != 0)
2698		/* Make sure the server replies ESTALE to the client. */
2699		error = ESTALE;
2700	if (nam && !error) {
2701		error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp,
2702		    &exp->nes_numsecflavor, &secflavors);
2703		if (error) {
2704			if (nfs_rootfhset) {
2705				exp->nes_exflag = 0;
2706				exp->nes_numsecflavor = 0;
2707				error = 0;
2708			} else {
2709				vput(*vpp);
2710			}
2711		} else {
2712			/* Copy the security flavors. */
2713			for (i = 0; i < exp->nes_numsecflavor; i++)
2714				exp->nes_secflavors[i] = secflavors[i];
2715		}
2716	}
2717	NFSEXITCODE(error);
2718	return (error);
2719}
2720
2721/*
2722 * nfsd_fhtovp() - convert a fh to a vnode ptr
2723 * 	- look up fsid in mount list (if not found ret error)
2724 *	- get vp and export rights by calling nfsvno_fhtovp()
2725 *	- if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon
2726 *	  for AUTH_SYS
2727 *	- if mpp != NULL, return the mount point so that it can
2728 *	  be used for vn_finished_write() by the caller
2729 */
2730void
2731nfsd_fhtovp(struct nfsrv_descript *nd, struct nfsrvfh *nfp, int lktype,
2732    struct vnode **vpp, struct nfsexstuff *exp,
2733    struct mount **mpp, int startwrite, struct thread *p)
2734{
2735	struct mount *mp;
2736	struct ucred *credanon;
2737	fhandle_t *fhp;
2738
2739	fhp = (fhandle_t *)nfp->nfsrvfh_data;
2740	/*
2741	 * Check for the special case of the nfsv4root_fh.
2742	 */
2743	mp = vfs_busyfs(&fhp->fh_fsid);
2744	if (mpp != NULL)
2745		*mpp = mp;
2746	if (mp == NULL) {
2747		*vpp = NULL;
2748		nd->nd_repstat = ESTALE;
2749		goto out;
2750	}
2751
2752	if (startwrite) {
2753		vn_start_write(NULL, mpp, V_WAIT);
2754		if (lktype == LK_SHARED && !(MNT_SHARED_WRITES(mp)))
2755			lktype = LK_EXCLUSIVE;
2756	}
2757	nd->nd_repstat = nfsvno_fhtovp(mp, fhp, nd->nd_nam, lktype, vpp, exp,
2758	    &credanon);
2759	vfs_unbusy(mp);
2760
2761	/*
2762	 * For NFSv4 without a pseudo root fs, unexported file handles
2763	 * can be returned, so that Lookup works everywhere.
2764	 */
2765	if (!nd->nd_repstat && exp->nes_exflag == 0 &&
2766	    !(nd->nd_flag & ND_NFSV4)) {
2767		vput(*vpp);
2768		nd->nd_repstat = EACCES;
2769	}
2770
2771	/*
2772	 * Personally, I've never seen any point in requiring a
2773	 * reserved port#, since only in the rare case where the
2774	 * clients are all boxes with secure system priviledges,
2775	 * does it provide any enhanced security, but... some people
2776	 * believe it to be useful and keep putting this code back in.
2777	 * (There is also some "security checker" out there that
2778	 *  complains if the nfs server doesn't enforce this.)
2779	 * However, note the following:
2780	 * RFC3530 (NFSv4) specifies that a reserved port# not be
2781	 *	required.
2782	 * RFC2623 recommends that, if a reserved port# is checked for,
2783	 *	that there be a way to turn that off--> ifdef'd.
2784	 */
2785#ifdef NFS_REQRSVPORT
2786	if (!nd->nd_repstat) {
2787		struct sockaddr_in *saddr;
2788		struct sockaddr_in6 *saddr6;
2789
2790		saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
2791		saddr6 = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in6 *);
2792		if (!(nd->nd_flag & ND_NFSV4) &&
2793		    ((saddr->sin_family == AF_INET &&
2794		      ntohs(saddr->sin_port) >= IPPORT_RESERVED) ||
2795		     (saddr6->sin6_family == AF_INET6 &&
2796		      ntohs(saddr6->sin6_port) >= IPPORT_RESERVED))) {
2797			vput(*vpp);
2798			nd->nd_repstat = (NFSERR_AUTHERR | AUTH_TOOWEAK);
2799		}
2800	}
2801#endif	/* NFS_REQRSVPORT */
2802
2803	/*
2804	 * Check/setup credentials.
2805	 */
2806	if (!nd->nd_repstat) {
2807		nd->nd_saveduid = nd->nd_cred->cr_uid;
2808		nd->nd_repstat = nfsd_excred(nd, exp, credanon);
2809		if (nd->nd_repstat)
2810			vput(*vpp);
2811	}
2812	if (credanon != NULL)
2813		crfree(credanon);
2814	if (nd->nd_repstat) {
2815		if (startwrite)
2816			vn_finished_write(mp);
2817		*vpp = NULL;
2818		if (mpp != NULL)
2819			*mpp = NULL;
2820	}
2821
2822out:
2823	NFSEXITCODE2(0, nd);
2824}
2825
2826/*
2827 * glue for fp.
2828 */
2829int
2830fp_getfvp(struct thread *p, int fd, struct file **fpp, struct vnode **vpp)
2831{
2832	struct filedesc *fdp;
2833	struct file *fp;
2834	int error = 0;
2835
2836	fdp = p->td_proc->p_fd;
2837	if (fd >= fdp->fd_nfiles ||
2838	    (fp = fdp->fd_ofiles[fd]) == NULL) {
2839		error = EBADF;
2840		goto out;
2841	}
2842	*fpp = fp;
2843
2844out:
2845	NFSEXITCODE(error);
2846	return (error);
2847}
2848
2849/*
2850 * Called from nfssvc() to update the exports list. Just call
2851 * vfs_export(). This has to be done, since the v4 root fake fs isn't
2852 * in the mount list.
2853 */
2854int
2855nfsrv_v4rootexport(void *argp, struct ucred *cred, struct thread *p)
2856{
2857	struct nfsex_args *nfsexargp = (struct nfsex_args *)argp;
2858	int error = 0;
2859	struct nameidata nd;
2860	fhandle_t fh;
2861
2862	error = vfs_export(&nfsv4root_mnt, &nfsexargp->export);
2863	if ((nfsexargp->export.ex_flags & MNT_DELEXPORT) != 0)
2864		nfs_rootfhset = 0;
2865	else if (error == 0) {
2866		if (nfsexargp->fspec == NULL) {
2867			error = EPERM;
2868			goto out;
2869		}
2870		/*
2871		 * If fspec != NULL, this is the v4root path.
2872		 */
2873		NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_USERSPACE,
2874		    nfsexargp->fspec, p);
2875		if ((error = namei(&nd)) != 0)
2876			goto out;
2877		error = nfsvno_getfh(nd.ni_vp, &fh, p);
2878		vrele(nd.ni_vp);
2879		if (!error) {
2880			nfs_rootfh.nfsrvfh_len = NFSX_MYFH;
2881			NFSBCOPY((caddr_t)&fh,
2882			    nfs_rootfh.nfsrvfh_data,
2883			    sizeof (fhandle_t));
2884			nfs_rootfhset = 1;
2885		}
2886	}
2887
2888out:
2889	NFSEXITCODE(error);
2890	return (error);
2891}
2892
2893/*
2894 * This function needs to test to see if the system is near its limit
2895 * for memory allocation via malloc() or mget() and return True iff
2896 * either of these resources are near their limit.
2897 * XXX (For now, this is just a stub.)
2898 */
2899int nfsrv_testmalloclimit = 0;
2900int
2901nfsrv_mallocmget_limit(void)
2902{
2903	static int printmesg = 0;
2904	static int testval = 1;
2905
2906	if (nfsrv_testmalloclimit && (testval++ % 1000) == 0) {
2907		if ((printmesg++ % 100) == 0)
2908			printf("nfsd: malloc/mget near limit\n");
2909		return (1);
2910	}
2911	return (0);
2912}
2913
2914/*
2915 * BSD specific initialization of a mount point.
2916 */
2917void
2918nfsd_mntinit(void)
2919{
2920	static int inited = 0;
2921
2922	if (inited)
2923		return;
2924	inited = 1;
2925	nfsv4root_mnt.mnt_flag = (MNT_RDONLY | MNT_EXPORTED);
2926	TAILQ_INIT(&nfsv4root_mnt.mnt_nvnodelist);
2927	TAILQ_INIT(&nfsv4root_mnt.mnt_activevnodelist);
2928	nfsv4root_mnt.mnt_export = NULL;
2929	TAILQ_INIT(&nfsv4root_opt);
2930	TAILQ_INIT(&nfsv4root_newopt);
2931	nfsv4root_mnt.mnt_opt = &nfsv4root_opt;
2932	nfsv4root_mnt.mnt_optnew = &nfsv4root_newopt;
2933	nfsv4root_mnt.mnt_nvnodelistsize = 0;
2934	nfsv4root_mnt.mnt_activevnodelistsize = 0;
2935}
2936
2937/*
2938 * Get a vnode for a file handle, without checking exports, etc.
2939 */
2940struct vnode *
2941nfsvno_getvp(fhandle_t *fhp)
2942{
2943	struct mount *mp;
2944	struct vnode *vp;
2945	int error;
2946
2947	mp = vfs_busyfs(&fhp->fh_fsid);
2948	if (mp == NULL)
2949		return (NULL);
2950	error = VFS_FHTOVP(mp, &fhp->fh_fid, LK_EXCLUSIVE, &vp);
2951	vfs_unbusy(mp);
2952	if (error)
2953		return (NULL);
2954	return (vp);
2955}
2956
2957/*
2958 * Do a local VOP_ADVLOCK().
2959 */
2960int
2961nfsvno_advlock(struct vnode *vp, int ftype, u_int64_t first,
2962    u_int64_t end, struct thread *td)
2963{
2964	int error = 0;
2965	struct flock fl;
2966	u_int64_t tlen;
2967
2968	if (nfsrv_dolocallocks == 0)
2969		goto out;
2970
2971	/* Check for VI_DOOMED here, so that VOP_ADVLOCK() isn't performed. */
2972	if ((vp->v_iflag & VI_DOOMED) != 0) {
2973		error = EPERM;
2974		goto out;
2975	}
2976
2977	fl.l_whence = SEEK_SET;
2978	fl.l_type = ftype;
2979	fl.l_start = (off_t)first;
2980	if (end == NFS64BITSSET) {
2981		fl.l_len = 0;
2982	} else {
2983		tlen = end - first;
2984		fl.l_len = (off_t)tlen;
2985	}
2986	/*
2987	 * For FreeBSD8, the l_pid and l_sysid must be set to the same
2988	 * values for all calls, so that all locks will be held by the
2989	 * nfsd server. (The nfsd server handles conflicts between the
2990	 * various clients.)
2991	 * Since an NFSv4 lockowner is a ClientID plus an array of up to 1024
2992	 * bytes, so it can't be put in l_sysid.
2993	 */
2994	if (nfsv4_sysid == 0)
2995		nfsv4_sysid = nlm_acquire_next_sysid();
2996	fl.l_pid = (pid_t)0;
2997	fl.l_sysid = (int)nfsv4_sysid;
2998
2999	NFSVOPUNLOCK(vp, 0);
3000	if (ftype == F_UNLCK)
3001		error = VOP_ADVLOCK(vp, (caddr_t)td->td_proc, F_UNLCK, &fl,
3002		    (F_POSIX | F_REMOTE));
3003	else
3004		error = VOP_ADVLOCK(vp, (caddr_t)td->td_proc, F_SETLK, &fl,
3005		    (F_POSIX | F_REMOTE));
3006	NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
3007
3008out:
3009	NFSEXITCODE(error);
3010	return (error);
3011}
3012
3013/*
3014 * Check the nfsv4 root exports.
3015 */
3016int
3017nfsvno_v4rootexport(struct nfsrv_descript *nd)
3018{
3019	struct ucred *credanon;
3020	int exflags, error = 0, numsecflavor, *secflavors, i;
3021
3022	error = vfs_stdcheckexp(&nfsv4root_mnt, nd->nd_nam, &exflags,
3023	    &credanon, &numsecflavor, &secflavors);
3024	if (error) {
3025		error = NFSERR_PROGUNAVAIL;
3026		goto out;
3027	}
3028	if (credanon != NULL)
3029		crfree(credanon);
3030	for (i = 0; i < numsecflavor; i++) {
3031		if (secflavors[i] == AUTH_SYS)
3032			nd->nd_flag |= ND_EXAUTHSYS;
3033		else if (secflavors[i] == RPCSEC_GSS_KRB5)
3034			nd->nd_flag |= ND_EXGSS;
3035		else if (secflavors[i] == RPCSEC_GSS_KRB5I)
3036			nd->nd_flag |= ND_EXGSSINTEGRITY;
3037		else if (secflavors[i] == RPCSEC_GSS_KRB5P)
3038			nd->nd_flag |= ND_EXGSSPRIVACY;
3039	}
3040
3041out:
3042	NFSEXITCODE(error);
3043	return (error);
3044}
3045
3046/*
3047 * Nfs server psuedo system call for the nfsd's
3048 */
3049/*
3050 * MPSAFE
3051 */
3052static int
3053nfssvc_nfsd(struct thread *td, struct nfssvc_args *uap)
3054{
3055	struct file *fp;
3056	struct nfsd_addsock_args sockarg;
3057	struct nfsd_nfsd_args nfsdarg;
3058	int error;
3059
3060	if (uap->flag & NFSSVC_NFSDADDSOCK) {
3061		error = copyin(uap->argp, (caddr_t)&sockarg, sizeof (sockarg));
3062		if (error)
3063			goto out;
3064		/*
3065		 * Since we don't know what rights might be required,
3066		 * pretend that we need them all. It is better to be too
3067		 * careful than too reckless.
3068		 */
3069		if ((error = fget(td, sockarg.sock, CAP_SOCK_ALL, &fp)) != 0)
3070			goto out;
3071		if (fp->f_type != DTYPE_SOCKET) {
3072			fdrop(fp, td);
3073			error = EPERM;
3074			goto out;
3075		}
3076		error = nfsrvd_addsock(fp);
3077		fdrop(fp, td);
3078	} else if (uap->flag & NFSSVC_NFSDNFSD) {
3079		if (uap->argp == NULL) {
3080			error = EINVAL;
3081			goto out;
3082		}
3083		error = copyin(uap->argp, (caddr_t)&nfsdarg,
3084		    sizeof (nfsdarg));
3085		if (error)
3086			goto out;
3087		error = nfsrvd_nfsd(td, &nfsdarg);
3088	} else {
3089		error = nfssvc_srvcall(td, uap, td->td_ucred);
3090	}
3091
3092out:
3093	NFSEXITCODE(error);
3094	return (error);
3095}
3096
3097static int
3098nfssvc_srvcall(struct thread *p, struct nfssvc_args *uap, struct ucred *cred)
3099{
3100	struct nfsex_args export;
3101	struct file *fp = NULL;
3102	int stablefd, len;
3103	struct nfsd_clid adminrevoke;
3104	struct nfsd_dumplist dumplist;
3105	struct nfsd_dumpclients *dumpclients;
3106	struct nfsd_dumplocklist dumplocklist;
3107	struct nfsd_dumplocks *dumplocks;
3108	struct nameidata nd;
3109	vnode_t vp;
3110	int error = EINVAL, igotlock;
3111	struct proc *procp;
3112	static int suspend_nfsd = 0;
3113
3114	if (uap->flag & NFSSVC_PUBLICFH) {
3115		NFSBZERO((caddr_t)&nfs_pubfh.nfsrvfh_data,
3116		    sizeof (fhandle_t));
3117		error = copyin(uap->argp,
3118		    &nfs_pubfh.nfsrvfh_data, sizeof (fhandle_t));
3119		if (!error)
3120			nfs_pubfhset = 1;
3121	} else if (uap->flag & NFSSVC_V4ROOTEXPORT) {
3122		error = copyin(uap->argp,(caddr_t)&export,
3123		    sizeof (struct nfsex_args));
3124		if (!error)
3125			error = nfsrv_v4rootexport(&export, cred, p);
3126	} else if (uap->flag & NFSSVC_NOPUBLICFH) {
3127		nfs_pubfhset = 0;
3128		error = 0;
3129	} else if (uap->flag & NFSSVC_STABLERESTART) {
3130		error = copyin(uap->argp, (caddr_t)&stablefd,
3131		    sizeof (int));
3132		if (!error)
3133			error = fp_getfvp(p, stablefd, &fp, &vp);
3134		if (!error && (NFSFPFLAG(fp) & (FREAD | FWRITE)) != (FREAD | FWRITE))
3135			error = EBADF;
3136		if (!error && newnfs_numnfsd != 0)
3137			error = EPERM;
3138		if (!error) {
3139			nfsrv_stablefirst.nsf_fp = fp;
3140			nfsrv_setupstable(p);
3141		}
3142	} else if (uap->flag & NFSSVC_ADMINREVOKE) {
3143		error = copyin(uap->argp, (caddr_t)&adminrevoke,
3144		    sizeof (struct nfsd_clid));
3145		if (!error)
3146			error = nfsrv_adminrevoke(&adminrevoke, p);
3147	} else if (uap->flag & NFSSVC_DUMPCLIENTS) {
3148		error = copyin(uap->argp, (caddr_t)&dumplist,
3149		    sizeof (struct nfsd_dumplist));
3150		if (!error && (dumplist.ndl_size < 1 ||
3151			dumplist.ndl_size > NFSRV_MAXDUMPLIST))
3152			error = EPERM;
3153		if (!error) {
3154		    len = sizeof (struct nfsd_dumpclients) * dumplist.ndl_size;
3155		    dumpclients = (struct nfsd_dumpclients *)malloc(len,
3156			M_TEMP, M_WAITOK);
3157		    nfsrv_dumpclients(dumpclients, dumplist.ndl_size);
3158		    error = copyout(dumpclients,
3159			CAST_USER_ADDR_T(dumplist.ndl_list), len);
3160		    free((caddr_t)dumpclients, M_TEMP);
3161		}
3162	} else if (uap->flag & NFSSVC_DUMPLOCKS) {
3163		error = copyin(uap->argp, (caddr_t)&dumplocklist,
3164		    sizeof (struct nfsd_dumplocklist));
3165		if (!error && (dumplocklist.ndllck_size < 1 ||
3166			dumplocklist.ndllck_size > NFSRV_MAXDUMPLIST))
3167			error = EPERM;
3168		if (!error)
3169			error = nfsrv_lookupfilename(&nd,
3170				dumplocklist.ndllck_fname, p);
3171		if (!error) {
3172			len = sizeof (struct nfsd_dumplocks) *
3173				dumplocklist.ndllck_size;
3174			dumplocks = (struct nfsd_dumplocks *)malloc(len,
3175				M_TEMP, M_WAITOK);
3176			nfsrv_dumplocks(nd.ni_vp, dumplocks,
3177			    dumplocklist.ndllck_size, p);
3178			vput(nd.ni_vp);
3179			error = copyout(dumplocks,
3180			    CAST_USER_ADDR_T(dumplocklist.ndllck_list), len);
3181			free((caddr_t)dumplocks, M_TEMP);
3182		}
3183	} else if (uap->flag & NFSSVC_BACKUPSTABLE) {
3184		procp = p->td_proc;
3185		PROC_LOCK(procp);
3186		nfsd_master_pid = procp->p_pid;
3187		bcopy(procp->p_comm, nfsd_master_comm, MAXCOMLEN + 1);
3188		nfsd_master_start = procp->p_stats->p_start;
3189		nfsd_master_proc = procp;
3190		PROC_UNLOCK(procp);
3191	} else if ((uap->flag & NFSSVC_SUSPENDNFSD) != 0) {
3192		NFSLOCKV4ROOTMUTEX();
3193		if (suspend_nfsd == 0) {
3194			/* Lock out all nfsd threads */
3195			do {
3196				igotlock = nfsv4_lock(&nfsd_suspend_lock, 1,
3197				    NULL, NFSV4ROOTLOCKMUTEXPTR, NULL);
3198			} while (igotlock == 0 && suspend_nfsd == 0);
3199			suspend_nfsd = 1;
3200		}
3201		NFSUNLOCKV4ROOTMUTEX();
3202		error = 0;
3203	} else if ((uap->flag & NFSSVC_RESUMENFSD) != 0) {
3204		NFSLOCKV4ROOTMUTEX();
3205		if (suspend_nfsd != 0) {
3206			nfsv4_unlock(&nfsd_suspend_lock, 0);
3207			suspend_nfsd = 0;
3208		}
3209		NFSUNLOCKV4ROOTMUTEX();
3210		error = 0;
3211	}
3212
3213	NFSEXITCODE(error);
3214	return (error);
3215}
3216
3217/*
3218 * Check exports.
3219 * Returns 0 if ok, 1 otherwise.
3220 */
3221int
3222nfsvno_testexp(struct nfsrv_descript *nd, struct nfsexstuff *exp)
3223{
3224	int i;
3225
3226	/*
3227	 * This seems odd, but allow the case where the security flavor
3228	 * list is empty. This happens when NFSv4 is traversing non-exported
3229	 * file systems. Exported file systems should always have a non-empty
3230	 * security flavor list.
3231	 */
3232	if (exp->nes_numsecflavor == 0)
3233		return (0);
3234
3235	for (i = 0; i < exp->nes_numsecflavor; i++) {
3236		/*
3237		 * The tests for privacy and integrity must be first,
3238		 * since ND_GSS is set for everything but AUTH_SYS.
3239		 */
3240		if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5P &&
3241		    (nd->nd_flag & ND_GSSPRIVACY))
3242			return (0);
3243		if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5I &&
3244		    (nd->nd_flag & ND_GSSINTEGRITY))
3245			return (0);
3246		if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5 &&
3247		    (nd->nd_flag & ND_GSS))
3248			return (0);
3249		if (exp->nes_secflavors[i] == AUTH_SYS &&
3250		    (nd->nd_flag & ND_GSS) == 0)
3251			return (0);
3252	}
3253	return (1);
3254}
3255
3256/*
3257 * Calculate a hash value for the fid in a file handle.
3258 */
3259uint32_t
3260nfsrv_hashfh(fhandle_t *fhp)
3261{
3262	uint32_t hashval;
3263
3264	hashval = hash32_buf(&fhp->fh_fid, sizeof(struct fid), 0);
3265	return (hashval);
3266}
3267
3268/*
3269 * Signal the userland master nfsd to backup the stable restart file.
3270 */
3271void
3272nfsrv_backupstable(void)
3273{
3274	struct proc *procp;
3275
3276	if (nfsd_master_proc != NULL) {
3277		procp = pfind(nfsd_master_pid);
3278		/* Try to make sure it is the correct process. */
3279		if (procp == nfsd_master_proc &&
3280		    procp->p_stats->p_start.tv_sec ==
3281		    nfsd_master_start.tv_sec &&
3282		    procp->p_stats->p_start.tv_usec ==
3283		    nfsd_master_start.tv_usec &&
3284		    strcmp(procp->p_comm, nfsd_master_comm) == 0)
3285			kern_psignal(procp, SIGUSR2);
3286		else
3287			nfsd_master_proc = NULL;
3288
3289		if (procp != NULL)
3290			PROC_UNLOCK(procp);
3291	}
3292}
3293
3294extern int (*nfsd_call_nfsd)(struct thread *, struct nfssvc_args *);
3295
3296/*
3297 * Called once to initialize data structures...
3298 */
3299static int
3300nfsd_modevent(module_t mod, int type, void *data)
3301{
3302	int error = 0, i;
3303	static int loaded = 0;
3304
3305	switch (type) {
3306	case MOD_LOAD:
3307		if (loaded)
3308			goto out;
3309		newnfs_portinit();
3310		for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
3311			snprintf(nfsrchash_table[i].lock_name,
3312			    sizeof(nfsrchash_table[i].lock_name), "nfsrc_tcp%d",
3313			    i);
3314			mtx_init(&nfsrchash_table[i].mtx,
3315			    nfsrchash_table[i].lock_name, NULL, MTX_DEF);
3316			snprintf(nfsrcahash_table[i].lock_name,
3317			    sizeof(nfsrcahash_table[i].lock_name), "nfsrc_tcpa%d",
3318			    i);
3319			mtx_init(&nfsrcahash_table[i].mtx,
3320			    nfsrcahash_table[i].lock_name, NULL, MTX_DEF);
3321		}
3322		mtx_init(&nfsrc_udpmtx, "nfs_udpcache_mutex", NULL, MTX_DEF);
3323		mtx_init(&nfs_v4root_mutex, "nfs_v4root_mutex", NULL, MTX_DEF);
3324		mtx_init(&nfsv4root_mnt.mnt_mtx, "struct mount mtx", NULL,
3325		    MTX_DEF);
3326		lockinit(&nfsv4root_mnt.mnt_explock, PVFS, "explock", 0, 0);
3327		nfsrvd_initcache();
3328		nfsd_init();
3329		NFSD_LOCK();
3330		nfsrvd_init(0);
3331		NFSD_UNLOCK();
3332		nfsd_mntinit();
3333#ifdef VV_DISABLEDELEG
3334		vn_deleg_ops.vndeleg_recall = nfsd_recalldelegation;
3335		vn_deleg_ops.vndeleg_disable = nfsd_disabledelegation;
3336#endif
3337		nfsd_call_servertimer = nfsrv_servertimer;
3338		nfsd_call_nfsd = nfssvc_nfsd;
3339		loaded = 1;
3340		break;
3341
3342	case MOD_UNLOAD:
3343		if (newnfs_numnfsd != 0) {
3344			error = EBUSY;
3345			break;
3346		}
3347
3348#ifdef VV_DISABLEDELEG
3349		vn_deleg_ops.vndeleg_recall = NULL;
3350		vn_deleg_ops.vndeleg_disable = NULL;
3351#endif
3352		nfsd_call_servertimer = NULL;
3353		nfsd_call_nfsd = NULL;
3354
3355		/* Clean out all NFSv4 state. */
3356		nfsrv_throwawayallstate(curthread);
3357
3358		/* Clean the NFS server reply cache */
3359		nfsrvd_cleancache();
3360
3361		/* Free up the krpc server pool. */
3362		if (nfsrvd_pool != NULL)
3363			svcpool_destroy(nfsrvd_pool);
3364
3365		/* and get rid of the locks */
3366		for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
3367			mtx_destroy(&nfsrchash_table[i].mtx);
3368			mtx_destroy(&nfsrcahash_table[i].mtx);
3369		}
3370		mtx_destroy(&nfsrc_udpmtx);
3371		mtx_destroy(&nfs_v4root_mutex);
3372		mtx_destroy(&nfsv4root_mnt.mnt_mtx);
3373		lockdestroy(&nfsv4root_mnt.mnt_explock);
3374		loaded = 0;
3375		break;
3376	default:
3377		error = EOPNOTSUPP;
3378		break;
3379	}
3380
3381out:
3382	NFSEXITCODE(error);
3383	return (error);
3384}
/*
 * Module descriptor: the module is named "nfsd" and its events are
 * delivered to nfsd_modevent().  The last member is left NULL.
 */
static moduledata_t nfsd_mod = {
	"nfsd",
	nfsd_modevent,
	NULL,
};
/* Register the module at the SI_SUB_VFS stage with SI_ORDER_ANY ordering. */
DECLARE_MODULE(nfsd, nfsd_mod, SI_SUB_VFS, SI_ORDER_ANY);

/*
 * So that loader and kldload(2) can find us, wherever we are.
 * The module is published as version 1 and declares dependencies on
 * nfscommon, nfslock, nfslockd, krpc and nfssvc; every version argument
 * of each dependency is 1.
 */
MODULE_VERSION(nfsd, 1);
MODULE_DEPEND(nfsd, nfscommon, 1, 1, 1);
MODULE_DEPEND(nfsd, nfslock, 1, 1, 1);
MODULE_DEPEND(nfsd, nfslockd, 1, 1, 1);
MODULE_DEPEND(nfsd, krpc, 1, 1, 1);
MODULE_DEPEND(nfsd, nfssvc, 1, 1, 1);
3399
3400