nfs_vfsops.c revision 3898:c788126f2a20
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 *
25 *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
26 *	All rights reserved.
27 */
28
29#pragma ident	"%Z%%M%	%I%	%E% SMI"
30
31#include <sys/param.h>
32#include <sys/types.h>
33#include <sys/systm.h>
34#include <sys/cred.h>
35#include <sys/vfs.h>
36#include <sys/vfs_opreg.h>
37#include <sys/vnode.h>
38#include <sys/pathname.h>
39#include <sys/sysmacros.h>
40#include <sys/kmem.h>
41#include <sys/mkdev.h>
42#include <sys/mount.h>
43#include <sys/mntent.h>
44#include <sys/statvfs.h>
45#include <sys/errno.h>
46#include <sys/debug.h>
47#include <sys/cmn_err.h>
48#include <sys/utsname.h>
49#include <sys/bootconf.h>
50#include <sys/modctl.h>
51#include <sys/acl.h>
52#include <sys/flock.h>
53#include <sys/policy.h>
54#include <sys/zone.h>
55#include <sys/class.h>
56#include <sys/socket.h>
57#include <sys/netconfig.h>
58#include <sys/mntent.h>
59#include <sys/tsol/label.h>
60
61#include <rpc/types.h>
62#include <rpc/auth.h>
63#include <rpc/clnt.h>
64
65#include <nfs/nfs.h>
66#include <nfs/nfs_clnt.h>
67#include <nfs/rnode.h>
68#include <nfs/mount.h>
69#include <nfs/nfs_acl.h>
70
71#include <fs/fs_subr.h>
72
73/*
74 * From rpcsec module (common/rpcsec).
75 */
76extern int sec_clnt_loadinfo(struct sec_data *, struct sec_data **, model_t);
77extern void sec_clnt_freeinfo(struct sec_data *);
78
79static int pathconf_copyin(struct nfs_args *, struct pathcnf *);
80static int pathconf_get(struct mntinfo *, struct nfs_args *);
81static void pathconf_rele(struct mntinfo *);
82
/*
 * Display names of the NFS version 2 procedures, one per procedure number.
 * The order and contents of this array must be kept in sync with
 * rfsreqcnt_v2_tmpl in nfs_stats.c.
 */
static char *rfsnames_v2[] = {
	"null",		/*  0 */
	"getattr",	/*  1 */
	"setattr",	/*  2 */
	"unused",	/*  3 */
	"lookup",	/*  4 */
	"readlink",	/*  5 */
	"read",		/*  6 */
	"unused",	/*  7 */
	"write",	/*  8 */
	"create",	/*  9 */
	"remove",	/* 10 */
	"rename",	/* 11 */
	"link",		/* 12 */
	"symlink",	/* 13 */
	"mkdir",	/* 14 */
	"rmdir",	/* 15 */
	"readdir",	/* 16 */
	"fsstat"	/* 17 */
};
92
93/*
94 * This table maps from NFS protocol number into call type.
95 * Zero means a "Lookup" type call
96 * One  means a "Read" type call
97 * Two  means a "Write" type call
98 * This is used to select a default time-out.
99 */
100static uchar_t call_type_v2[] = {
101	0, 0, 1, 0, 0, 0, 1,
102	0, 2, 2, 2, 2, 2, 2,
103	2, 2, 1, 0
104};
105
106/*
107 * Similar table, but to determine which timer to use
108 * (only real reads and writes!)
109 */
110static uchar_t timer_type_v2[] = {
111	0, 0, 0, 0, 0, 0, 1,
112	0, 2, 0, 0, 0, 0, 0,
113	0, 0, 1, 0
114};
115
116/*
117 * This table maps from NFS protocol number into a call type
118 * for the semisoft mount option.
119 * Zero means do not repeat operation.
120 * One  means repeat.
121 */
122static uchar_t ss_call_type_v2[] = {
123	0, 0, 1, 0, 0, 0, 0,
124	0, 1, 1, 1, 1, 1, 1,
125	1, 1, 0, 0
126};
127
128/*
129 * nfs vfs operations.
130 */
131static int	nfs_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
132static int	nfs_unmount(vfs_t *, int, cred_t *);
133static int	nfs_root(vfs_t *, vnode_t **);
134static int	nfs_statvfs(vfs_t *, struct statvfs64 *);
135static int	nfs_sync(vfs_t *, short, cred_t *);
136static int	nfs_vget(vfs_t *, vnode_t **, fid_t *);
137static int	nfs_mountroot(vfs_t *, whymountroot_t);
138static void	nfs_freevfs(vfs_t *);
139
140static int	nfsrootvp(vnode_t **, vfs_t *, struct servinfo *,
141		    int, cred_t *, zone_t *);
142
143/*
144 * Initialize the vfs structure
145 */
146
147int nfsfstyp;
148vfsops_t *nfs_vfsops;
149
150/*
151 * Debug variable to check for rdma based
152 * transport startup and cleanup. Controlled
153 * through /etc/system. Off by default.
154 */
155int rdma_debug = 0;
156
157int
158nfsinit(int fstyp, char *name)
159{
160	static const fs_operation_def_t nfs_vfsops_template[] = {
161		VFSNAME_MOUNT,		{ .vfs_mount = nfs_mount },
162		VFSNAME_UNMOUNT,	{ .vfs_unmount = nfs_unmount },
163		VFSNAME_ROOT,		{ .vfs_root = nfs_root },
164		VFSNAME_STATVFS,	{ .vfs_statvfs = nfs_statvfs },
165		VFSNAME_SYNC,		{ .vfs_sync = nfs_sync },
166		VFSNAME_VGET,		{ .vfs_vget = nfs_vget },
167		VFSNAME_MOUNTROOT,	{ .vfs_mountroot = nfs_mountroot },
168		VFSNAME_FREEVFS,	{ .vfs_freevfs = nfs_freevfs },
169		NULL,			NULL
170	};
171	int error;
172
173	error = vfs_setfsops(fstyp, nfs_vfsops_template, &nfs_vfsops);
174	if (error != 0) {
175		zcmn_err(GLOBAL_ZONEID, CE_WARN,
176		    "nfsinit: bad vfs ops template");
177		return (error);
178	}
179
180	error = vn_make_ops(name, nfs_vnodeops_template, &nfs_vnodeops);
181	if (error != 0) {
182		(void) vfs_freevfsops_by_type(fstyp);
183		zcmn_err(GLOBAL_ZONEID, CE_WARN,
184		    "nfsinit: bad vnode ops template");
185		return (error);
186	}
187
188
189	nfsfstyp = fstyp;
190
191	return (0);
192}
193
/*
 * Counterpart of nfsinit().  There is currently nothing to undo here.
 */
void
nfsfini(void)
{
	/* intentionally empty */
}
198
199static void
200nfs_free_args(struct nfs_args *nargs, nfs_fhandle *fh)
201{
202
203	if (fh)
204		kmem_free(fh, sizeof (*fh));
205
206	if (nargs->pathconf) {
207		kmem_free(nargs->pathconf, sizeof (struct pathcnf));
208		nargs->pathconf = NULL;
209	}
210
211	if (nargs->knconf) {
212		if (nargs->knconf->knc_protofmly)
213			kmem_free(nargs->knconf->knc_protofmly,
214				KNC_STRSIZE);
215		if (nargs->knconf->knc_proto)
216			kmem_free(nargs->knconf->knc_proto, KNC_STRSIZE);
217		kmem_free(nargs->knconf, sizeof (*nargs->knconf));
218		nargs->knconf = NULL;
219	}
220
221	if (nargs->fh) {
222		kmem_free(nargs->fh, strlen(nargs->fh) + 1);
223		nargs->fh = NULL;
224	}
225
226	if (nargs->hostname) {
227		kmem_free(nargs->hostname, strlen(nargs->hostname) + 1);
228		nargs->hostname = NULL;
229	}
230
231	if (nargs->addr) {
232		if (nargs->addr->buf) {
233			ASSERT(nargs->addr->len);
234			kmem_free(nargs->addr->buf, nargs->addr->len);
235		}
236		kmem_free(nargs->addr, sizeof (struct netbuf));
237		nargs->addr = NULL;
238	}
239
240	if (nargs->syncaddr) {
241		ASSERT(nargs->syncaddr->len);
242		if (nargs->syncaddr->buf) {
243			ASSERT(nargs->syncaddr->len);
244			kmem_free(nargs->syncaddr->buf, nargs->syncaddr->len);
245		}
246		kmem_free(nargs->syncaddr, sizeof (struct netbuf));
247		nargs->syncaddr = NULL;
248	}
249
250	if (nargs->netname) {
251		kmem_free(nargs->netname, strlen(nargs->netname) + 1);
252		nargs->netname = NULL;
253	}
254
255	if (nargs->nfs_ext_u.nfs_extA.secdata) {
256		sec_clnt_freeinfo(
257			nargs->nfs_ext_u.nfs_extA.secdata);
258		nargs->nfs_ext_u.nfs_extA.secdata = NULL;
259	}
260}
261
262static int
263nfs_copyin(char *data, int datalen, struct nfs_args *nargs, nfs_fhandle *fh)
264{
265
266	int error;
267	size_t nlen;			/* length of netname */
268	size_t hlen;			/* length of hostname */
269	char netname[MAXNETNAMELEN+1];	/* server's netname */
270	struct netbuf addr;		/* server's address */
271	struct netbuf syncaddr;		/* AUTH_DES time sync addr */
272	struct knetconfig *knconf;	/* transport knetconfig structure */
273	struct sec_data *secdata = NULL;	/* security data */
274	STRUCT_DECL(nfs_args, args);		/* nfs mount arguments */
275	STRUCT_DECL(knetconfig, knconf_tmp);
276	STRUCT_DECL(netbuf, addr_tmp);
277	int flags;
278	struct pathcnf	*pc;		/* Pathconf */
279	char *p, *pf;
280	char *userbufptr;
281
282
283	bzero(nargs, sizeof (*nargs));
284
285	STRUCT_INIT(args, get_udatamodel());
286	bzero(STRUCT_BUF(args), SIZEOF_STRUCT(nfs_args, DATAMODEL_NATIVE));
287	if (copyin(data, STRUCT_BUF(args), MIN(datalen,
288		STRUCT_SIZE(args))))
289		return (EFAULT);
290
291	nargs->wsize = STRUCT_FGET(args, wsize);
292	nargs->rsize = STRUCT_FGET(args, rsize);
293	nargs->timeo = STRUCT_FGET(args, timeo);
294	nargs->retrans = STRUCT_FGET(args, retrans);
295	nargs->acregmin = STRUCT_FGET(args, acregmin);
296	nargs->acregmax = STRUCT_FGET(args, acregmax);
297	nargs->acdirmin = STRUCT_FGET(args, acdirmin);
298	nargs->acdirmax = STRUCT_FGET(args, acdirmax);
299
300	flags = STRUCT_FGET(args, flags);
301	nargs->flags = flags;
302
303
304	addr.buf = NULL;
305	syncaddr.buf = NULL;
306
307	/*
308	 * Allocate space for a knetconfig structure and
309	 * its strings and copy in from user-land.
310	 */
311	knconf = kmem_zalloc(sizeof (*knconf), KM_SLEEP);
312	STRUCT_INIT(knconf_tmp, get_udatamodel());
313	if (copyin(STRUCT_FGETP(args, knconf), STRUCT_BUF(knconf_tmp),
314		STRUCT_SIZE(knconf_tmp))) {
315		kmem_free(knconf, sizeof (*knconf));
316		return (EFAULT);
317	}
318
319	knconf->knc_semantics = STRUCT_FGET(knconf_tmp, knc_semantics);
320	knconf->knc_protofmly = STRUCT_FGETP(knconf_tmp, knc_protofmly);
321	knconf->knc_proto = STRUCT_FGETP(knconf_tmp, knc_proto);
322	if (get_udatamodel() != DATAMODEL_LP64) {
323		knconf->knc_rdev = expldev(STRUCT_FGET(knconf_tmp, knc_rdev));
324	} else {
325		knconf->knc_rdev = STRUCT_FGET(knconf_tmp, knc_rdev);
326	}
327
328	pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
329	p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
330	error = copyinstr(knconf->knc_protofmly, pf, KNC_STRSIZE, NULL);
331	if (error) {
332		kmem_free(pf, KNC_STRSIZE);
333		kmem_free(p, KNC_STRSIZE);
334		kmem_free(knconf, sizeof (*knconf));
335		return (error);
336	}
337
338	error = copyinstr(knconf->knc_proto, p, KNC_STRSIZE, NULL);
339	if (error) {
340		kmem_free(pf, KNC_STRSIZE);
341		kmem_free(p, KNC_STRSIZE);
342		kmem_free(knconf, sizeof (*knconf));
343		return (error);
344	}
345
346
347	knconf->knc_protofmly = pf;
348	knconf->knc_proto = p;
349
350	nargs->knconf = knconf;
351
352	/* Copyin pathconf if there is one */
353	if (STRUCT_FGETP(args, pathconf) != NULL) {
354		pc = kmem_alloc(sizeof (*pc), KM_SLEEP);
355		error = pathconf_copyin(STRUCT_BUF(args), pc);
356		nargs->pathconf = pc;
357		if (error)
358			goto errout;
359	}
360
361	/*
362	 * Get server address
363	 */
364	STRUCT_INIT(addr_tmp, get_udatamodel());
365	if (copyin(STRUCT_FGETP(args, addr), STRUCT_BUF(addr_tmp),
366		STRUCT_SIZE(addr_tmp))) {
367		error = EFAULT;
368		goto errout;
369	}
370	nargs->addr = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
371	userbufptr = STRUCT_FGETP(addr_tmp, buf);
372	addr.len = STRUCT_FGET(addr_tmp, len);
373	addr.buf = kmem_alloc(addr.len, KM_SLEEP);
374	addr.maxlen = addr.len;
375	if (copyin(userbufptr, addr.buf, addr.len)) {
376		kmem_free(addr.buf, addr.len);
377		error = EFAULT;
378		goto errout;
379	}
380	bcopy(&addr, nargs->addr, sizeof (struct netbuf));
381
382	/*
383	 * Get the root fhandle
384	 */
385
386	if (copyin(STRUCT_FGETP(args, fh), &fh->fh_buf, NFS_FHSIZE)) {
387		error = EFAULT;
388		goto errout;
389	}
390	fh->fh_len = NFS_FHSIZE;
391
392	/*
393	 * Get server's hostname
394	 */
395	if (flags & NFSMNT_HOSTNAME) {
396		error = copyinstr(STRUCT_FGETP(args, hostname),
397			netname, sizeof (netname), &hlen);
398		if (error)
399			goto errout;
400		nargs->hostname = kmem_zalloc(hlen, KM_SLEEP);
401		(void) strcpy(nargs->hostname, netname);
402
403	} else {
404		nargs->hostname = NULL;
405	}
406
407
408	/*
409	 * If there are syncaddr and netname data, load them in. This is
410	 * to support data needed for NFSV4 when AUTH_DH is the negotiated
411	 * flavor via SECINFO. (instead of using MOUNT protocol in V3).
412	 */
413	netname[0] = '\0';
414	if (flags & NFSMNT_SECURE) {
415		if (STRUCT_FGETP(args, syncaddr) == NULL) {
416			error = EINVAL;
417			goto errout;
418		}
419		/* get syncaddr */
420		STRUCT_INIT(addr_tmp, get_udatamodel());
421		if (copyin(STRUCT_FGETP(args, syncaddr), STRUCT_BUF(addr_tmp),
422			STRUCT_SIZE(addr_tmp))) {
423			error = EINVAL;
424			goto errout;
425		}
426		userbufptr = STRUCT_FGETP(addr_tmp, buf);
427		syncaddr.len = STRUCT_FGET(addr_tmp, len);
428		syncaddr.buf = kmem_alloc(syncaddr.len, KM_SLEEP);
429		syncaddr.maxlen = syncaddr.len;
430		if (copyin(userbufptr, syncaddr.buf, syncaddr.len)) {
431			kmem_free(syncaddr.buf, syncaddr.len);
432			error = EFAULT;
433			goto errout;
434		}
435
436		nargs->syncaddr = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
437		bcopy(&syncaddr, nargs->syncaddr, sizeof (struct netbuf));
438
439		ASSERT(STRUCT_FGETP(args, netname));
440		if (copyinstr(STRUCT_FGETP(args, netname), netname,
441			sizeof (netname), &nlen)) {
442			error = EFAULT;
443			goto errout;
444		}
445
446		netname[nlen] = '\0';
447		nargs->netname = kmem_zalloc(nlen, KM_SLEEP);
448		(void) strcpy(nargs->netname, netname);
449	}
450
451	/*
452	 * Get the extention data which has the security data structure.
453	 * This includes data for AUTH_SYS as well.
454	 */
455	if (flags & NFSMNT_NEWARGS) {
456		nargs->nfs_args_ext = STRUCT_FGET(args, nfs_args_ext);
457		if (nargs->nfs_args_ext == NFS_ARGS_EXTA ||
458			nargs->nfs_args_ext == NFS_ARGS_EXTB) {
459			/*
460			 * Indicating the application is using the new
461			 * sec_data structure to pass in the security
462			 * data.
463			 */
464			if (STRUCT_FGETP(args,
465				nfs_ext_u.nfs_extA.secdata) != NULL) {
466				error = sec_clnt_loadinfo(
467					(struct sec_data *)STRUCT_FGETP(args,
468						nfs_ext_u.nfs_extA.secdata),
469						&secdata, get_udatamodel());
470			}
471			nargs->nfs_ext_u.nfs_extA.secdata = secdata;
472		}
473	}
474
475	if (error)
476		goto errout;
477
478	/*
479	 * Failover support:
480	 *
481	 * We may have a linked list of nfs_args structures,
482	 * which means the user is looking for failover.  If
483	 * the mount is either not "read-only" or "soft",
484	 * we want to bail out with EINVAL.
485	 */
486	if (nargs->nfs_args_ext == NFS_ARGS_EXTB)
487		nargs->nfs_ext_u.nfs_extB.next =
488			STRUCT_FGETP(args, nfs_ext_u.nfs_extB.next);
489
490errout:
491	if (error)
492		nfs_free_args(nargs, fh);
493
494	return (error);
495}
496
497
498/*
499 * nfs mount vfsop
500 * Set up mount info record and attach it to vfs struct.
501 */
502static int
503nfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
504{
505	char *data = uap->dataptr;
506	int error;
507	vnode_t *rtvp;			/* the server's root */
508	mntinfo_t *mi;			/* mount info, pointed at by vfs */
509	size_t nlen;			/* length of netname */
510	struct knetconfig *knconf;	/* transport knetconfig structure */
511	struct knetconfig *rdma_knconf;	/* rdma transport structure */
512	rnode_t *rp;
513	struct servinfo *svp;		/* nfs server info */
514	struct servinfo *svp_tail = NULL; /* previous nfs server info */
515	struct servinfo *svp_head;	/* first nfs server info */
516	struct servinfo *svp_2ndlast;	/* 2nd last in the server info list */
517	struct sec_data *secdata;	/* security data */
518	struct nfs_args	*args = NULL;
519	int flags, addr_type;
520	zone_t *zone = nfs_zone();
521	zone_t *mntzone = NULL;
522	nfs_fhandle	*fhandle = NULL;
523
524	if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
525		return (error);
526
527	if (mvp->v_type != VDIR)
528		return (ENOTDIR);
529
530	/*
531	 * get arguments
532	 *
533	 * nfs_args is now versioned and is extensible, so
534	 * uap->datalen might be different from sizeof (args)
535	 * in a compatible situation.
536	 */
537more:
538
539	if (!(uap->flags & MS_SYSSPACE)) {
540		if (args == NULL)
541			args = kmem_alloc(sizeof (struct nfs_args), KM_SLEEP);
542		else {
543			nfs_free_args(args, fhandle);
544			fhandle = NULL;
545		}
546		if (fhandle == NULL)
547			fhandle = kmem_zalloc(sizeof (nfs_fhandle), KM_SLEEP);
548		error = nfs_copyin(data, uap->datalen, args, fhandle);
549		if (error)  {
550			if (args)
551				kmem_free(args, sizeof (*args));
552			return (error);
553		}
554	} else {
555		args = (struct nfs_args *)data;
556		fhandle = (nfs_fhandle *)args->fh;
557	}
558
559
560	flags = args->flags;
561
562	if (uap->flags & MS_REMOUNT) {
563		size_t n;
564		char name[FSTYPSZ];
565
566		if (uap->flags & MS_SYSSPACE)
567			error = copystr(uap->fstype, name, FSTYPSZ, &n);
568		else
569			error = copyinstr(uap->fstype, name, FSTYPSZ, &n);
570
571		if (error) {
572			if (error == ENAMETOOLONG)
573				return (EINVAL);
574			return (error);
575		}
576
577
578		/*
579		 * This check is to ensure that the request is a
580		 * genuine nfs remount request.
581		 */
582
583		if (strncmp(name, "nfs", 3) != 0)
584			return (EINVAL);
585
586		/*
587		 * If the request changes the locking type, disallow the
588		 * remount,
589		 * because it's questionable whether we can transfer the
590		 * locking state correctly.
591		 *
592		 * Remounts need to save the pathconf information.
593		 * Part of the infamous static kludge.
594		 */
595
596		if ((mi = VFTOMI(vfsp)) != NULL) {
597			uint_t new_mi_llock;
598			uint_t old_mi_llock;
599
600			new_mi_llock = (flags & NFSMNT_LLOCK) ? 1 : 0;
601			old_mi_llock = (mi->mi_flags & MI_LLOCK) ? 1 : 0;
602			if (old_mi_llock != new_mi_llock)
603				return (EBUSY);
604		}
605		error = pathconf_get((struct mntinfo *)vfsp->vfs_data, args);
606
607		if (!(uap->flags & MS_SYSSPACE)) {
608			nfs_free_args(args, fhandle);
609			kmem_free(args, sizeof (*args));
610		}
611
612		return (error);
613	}
614
615	mutex_enter(&mvp->v_lock);
616	if (!(uap->flags & MS_OVERLAY) &&
617	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
618		mutex_exit(&mvp->v_lock);
619		if (!(uap->flags & MS_SYSSPACE)) {
620			nfs_free_args(args, fhandle);
621			kmem_free(args, sizeof (*args));
622		}
623		return (EBUSY);
624	}
625	mutex_exit(&mvp->v_lock);
626
627	/* make sure things are zeroed for errout: */
628	rtvp = NULL;
629	mi = NULL;
630	secdata = NULL;
631
632	/*
633	 * A valid knetconfig structure is required.
634	 */
635	if (!(flags & NFSMNT_KNCONF)) {
636		if (!(uap->flags & MS_SYSSPACE)) {
637			nfs_free_args(args, fhandle);
638			kmem_free(args, sizeof (*args));
639		}
640		return (EINVAL);
641	}
642
643	if ((strlen(args->knconf->knc_protofmly) >= KNC_STRSIZE) ||
644		(strlen(args->knconf->knc_proto) >= KNC_STRSIZE)) {
645		if (!(uap->flags & MS_SYSSPACE)) {
646			nfs_free_args(args, fhandle);
647			kmem_free(args, sizeof (*args));
648		}
649		return (EINVAL);
650	}
651
652
653	/*
654	 * Allocate a servinfo struct.
655	 */
656	svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
657	mutex_init(&svp->sv_lock, NULL, MUTEX_DEFAULT, NULL);
658	if (svp_tail) {
659		svp_2ndlast = svp_tail;
660		svp_tail->sv_next = svp;
661	} else {
662		svp_head = svp;
663		svp_2ndlast = svp;
664	}
665
666	svp_tail = svp;
667
668	/*
669	 * Get knetconfig and server address
670	 */
671	svp->sv_knconf = args->knconf;
672	args->knconf = NULL;
673
674	if (args->addr == NULL || args->addr->buf == NULL) {
675		error = EINVAL;
676		goto errout;
677	}
678
679	svp->sv_addr.maxlen = args->addr->maxlen;
680	svp->sv_addr.len = args->addr->len;
681	svp->sv_addr.buf = args->addr->buf;
682	args->addr->buf = NULL;
683
684	/*
685	 * Get the root fhandle
686	 */
687	ASSERT(fhandle);
688
689	bcopy(&fhandle->fh_buf, &svp->sv_fhandle.fh_buf, fhandle->fh_len);
690	svp->sv_fhandle.fh_len = fhandle->fh_len;
691
692	/*
693	 * Get server's hostname
694	 */
695	if (flags & NFSMNT_HOSTNAME) {
696		if (args->hostname == NULL) {
697			error = EINVAL;
698			goto errout;
699		}
700		svp->sv_hostnamelen = strlen(args->hostname) + 1;
701		svp->sv_hostname = args->hostname;
702		args->hostname = NULL;
703	} else {
704		char *p = "unknown-host";
705		svp->sv_hostnamelen = strlen(p) + 1;
706		svp->sv_hostname = kmem_zalloc(svp->sv_hostnamelen, KM_SLEEP);
707		(void) strcpy(svp->sv_hostname, p);
708	}
709
710
711	/*
712	 * RDMA MOUNT SUPPORT FOR NFS v2:
713	 * Establish, is it possible to use RDMA, if so overload the
714	 * knconf with rdma specific knconf and free the orignal.
715	 */
716	if ((flags & NFSMNT_TRYRDMA) || (flags & NFSMNT_DORDMA)) {
717		/*
718		 * Determine the addr type for RDMA, IPv4 or v6.
719		 */
720		if (strcmp(svp->sv_knconf->knc_protofmly, NC_INET) == 0)
721			addr_type = AF_INET;
722		else if (strcmp(svp->sv_knconf->knc_protofmly, NC_INET6) == 0)
723			addr_type = AF_INET6;
724
725		if (rdma_reachable(addr_type, &svp->sv_addr,
726			&rdma_knconf) == 0) {
727			/*
728			 * If successful, hijack, the orignal knconf and
729			 * replace with a new one, depending on the flags.
730			 */
731			svp->sv_origknconf = svp->sv_knconf;
732			svp->sv_knconf = rdma_knconf;
733			knconf = rdma_knconf;
734		} else {
735			if (flags & NFSMNT_TRYRDMA) {
736#ifdef	DEBUG
737				if (rdma_debug)
738					zcmn_err(getzoneid(), CE_WARN,
739					    "no RDMA onboard, revert\n");
740#endif
741			}
742
743			if (flags & NFSMNT_DORDMA) {
744				/*
745				 * If proto=rdma is specified and no RDMA
746				 * path to this server is avialable then
747				 * ditch this server.
748				 * This is not included in the mountable
749				 * server list or the replica list.
750				 * Check if more servers are specified;
751				 * Failover case, otherwise bail out of mount.
752				 */
753				if (args->nfs_args_ext ==
754				    NFS_ARGS_EXTB &&
755					args->nfs_ext_u.nfs_extB.next
756					!= NULL) {
757					data = (char *)
758						args->nfs_ext_u.nfs_extB.next;
759					if (uap->flags & MS_RDONLY &&
760					    !(flags & NFSMNT_SOFT)) {
761						if (svp_head->sv_next == NULL) {
762							svp_tail = NULL;
763							svp_2ndlast = NULL;
764							sv_free(svp_head);
765							goto more;
766						} else {
767							svp_tail = svp_2ndlast;
768							svp_2ndlast->sv_next =
769							    NULL;
770							sv_free(svp);
771							goto more;
772						}
773					}
774				} else {
775					/*
776					 * This is the last server specified
777					 * in the nfs_args list passed down
778					 * and its not rdma capable.
779					 */
780					if (svp_head->sv_next == NULL) {
781						/*
782						 * Is this the only one
783						 */
784						error = EINVAL;
785#ifdef	DEBUG
786						if (rdma_debug)
787							zcmn_err(getzoneid(),
788							    CE_WARN,
789							    "No RDMA srv");
790#endif
791						goto errout;
792					} else {
793						/*
794						 * There is list, since some
795						 * servers specified before
796						 * this passed all requirements
797						 */
798						svp_tail = svp_2ndlast;
799						svp_2ndlast->sv_next = NULL;
800						sv_free(svp);
801						goto proceed;
802					}
803				}
804			}
805		}
806	}
807
808	/*
809	 * Get the extention data which has the new security data structure.
810	 */
811	if (flags & NFSMNT_NEWARGS) {
812		switch (args->nfs_args_ext) {
813		case NFS_ARGS_EXTA:
814		case NFS_ARGS_EXTB:
815			/*
816			 * Indicating the application is using the new
817			 * sec_data structure to pass in the security
818			 * data.
819			 */
820			secdata = args->nfs_ext_u.nfs_extA.secdata;
821			if (secdata == NULL) {
822				error = EINVAL;
823			} else {
824				/*
825				 * Need to validate the flavor here if
826				 * sysspace, userspace was already
827				 * validate from the nfs_copyin function.
828				 */
829				switch (secdata->rpcflavor) {
830					case AUTH_NONE:
831					case AUTH_UNIX:
832					case AUTH_LOOPBACK:
833					case AUTH_DES:
834					case RPCSEC_GSS:
835						break;
836					default:
837						error = EINVAL;
838						goto errout;
839				}
840			}
841			args->nfs_ext_u.nfs_extA.secdata = NULL;
842			break;
843
844		default:
845			error = EINVAL;
846			break;
847		}
848	} else if (flags & NFSMNT_SECURE) {
849		/*
850		 * Keep this for backward compatibility to support
851		 * NFSMNT_SECURE/NFSMNT_RPCTIMESYNC flags.
852		 */
853		if (args->syncaddr == NULL || args->syncaddr->buf == NULL) {
854			error = EINVAL;
855			goto errout;
856		}
857
858		/*
859		 * get time sync address.
860		 */
861		if (args->syncaddr == NULL) {
862			error = EFAULT;
863			goto errout;
864		}
865
866		/*
867		 * Move security related data to the sec_data structure.
868		 */
869		{
870			dh_k4_clntdata_t *data;
871			char *pf, *p;
872
873			secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
874			if (flags & NFSMNT_RPCTIMESYNC)
875				secdata->flags |= AUTH_F_RPCTIMESYNC;
876			data = kmem_alloc(sizeof (*data), KM_SLEEP);
877			bcopy(args->syncaddr, &data->syncaddr,
878				sizeof (*args->syncaddr));
879
880
881			/*
882			 * duplicate the knconf information for the
883			 * new opaque data.
884			 */
885			data->knconf = kmem_alloc(sizeof (*knconf), KM_SLEEP);
886			*data->knconf = *knconf;
887			pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
888			p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
889			bcopy(knconf->knc_protofmly, pf, KNC_STRSIZE);
890			bcopy(knconf->knc_proto, pf, KNC_STRSIZE);
891			data->knconf->knc_protofmly = pf;
892			data->knconf->knc_proto = p;
893
894			/* move server netname to the sec_data structure */
895			nlen = strlen(args->hostname) + 1;
896			if (nlen != 0) {
897				data->netname = kmem_alloc(nlen, KM_SLEEP);
898				bcopy(args->hostname, data->netname, nlen);
899				data->netnamelen = (int)nlen;
900			}
901			secdata->secmod = secdata->rpcflavor = AUTH_DES;
902			secdata->data = (caddr_t)data;
903		}
904	} else {
905		secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
906		secdata->secmod = secdata->rpcflavor = AUTH_UNIX;
907		secdata->data = NULL;
908	}
909	svp->sv_secdata = secdata;
910
911	/*
912	 * See bug 1180236.
913	 * If mount secure failed, we will fall back to AUTH_NONE
914	 * and try again.  nfs3rootvp() will turn this back off.
915	 *
916	 * The NFS Version 2 mount uses GETATTR and STATFS procedures.
917	 * The server does not care if these procedures have the proper
918	 * authentication flavor, so if mount retries using AUTH_NONE
919	 * that does not require a credential setup for root then the
920	 * automounter would work without requiring root to be
921	 * keylogged into AUTH_DES.
922	 */
923	if (secdata->rpcflavor != AUTH_UNIX &&
924	    secdata->rpcflavor != AUTH_LOOPBACK)
925		secdata->flags |= AUTH_F_TRYNONE;
926
927	/*
928	 * Failover support:
929	 *
930	 * We may have a linked list of nfs_args structures,
931	 * which means the user is looking for failover.  If
932	 * the mount is either not "read-only" or "soft",
933	 * we want to bail out with EINVAL.
934	 */
935	if (args->nfs_args_ext == NFS_ARGS_EXTB &&
936	    args->nfs_ext_u.nfs_extB.next != NULL) {
937		if (uap->flags & MS_RDONLY && !(flags & NFSMNT_SOFT)) {
938			data = (char *)args->nfs_ext_u.nfs_extB.next;
939			goto more;
940		}
941		error = EINVAL;
942		goto errout;
943	}
944
945	/*
946	 * Determine the zone we're being mounted into.
947	 */
948	zone_hold(mntzone = zone);		/* start with this assumption */
949	if (getzoneid() == GLOBAL_ZONEID) {
950		zone_rele(mntzone);
951		mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
952		ASSERT(mntzone != NULL);
953		if (mntzone != zone) {
954			error = EBUSY;
955			goto errout;
956		}
957	}
958
959	if (is_system_labeled()) {
960		error = nfs_mount_label_policy(vfsp, &svp->sv_addr,
961		    svp->sv_knconf, cr);
962
963		if (error > 0)
964			goto errout;
965
966		if (error == -1) {
967			/* change mount to read-only to prevent write-down */
968			vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
969		}
970	}
971
972	/*
973	 * Stop the mount from going any further if the zone is going away.
974	 */
975	if (zone_status_get(mntzone) >= ZONE_IS_SHUTTING_DOWN) {
976		error = EBUSY;
977		goto errout;
978	}
979
980	/*
981	 * Get root vnode.
982	 */
983proceed:
984	error = nfsrootvp(&rtvp, vfsp, svp_head, flags, cr, mntzone);
985
986	if (error)
987		goto errout;
988
989	/*
990	 * Set option fields in the mount info record
991	 */
992	mi = VTOMI(rtvp);
993
994	if (svp_head->sv_next)
995		mi->mi_flags |= MI_LLOCK;
996
997	error = nfs_setopts(rtvp, DATAMODEL_NATIVE, args);
998	if (!error) {
999		/* static pathconf kludge */
1000		error = pathconf_get(mi, args);
1001	}
1002
1003errout:
1004	if (error) {
1005		if (rtvp != NULL) {
1006			rp = VTOR(rtvp);
1007			if (rp->r_flags & RHASHED)
1008				rp_rmhash(rp);
1009		}
1010		sv_free(svp_head);
1011		if (mi != NULL) {
1012			nfs_async_stop(vfsp);
1013			nfs_async_manager_stop(vfsp);
1014			if (mi->mi_io_kstats) {
1015				kstat_delete(mi->mi_io_kstats);
1016				mi->mi_io_kstats = NULL;
1017			}
1018			if (mi->mi_ro_kstats) {
1019				kstat_delete(mi->mi_ro_kstats);
1020				mi->mi_ro_kstats = NULL;
1021			}
1022			nfs_free_mi(mi);
1023		}
1024	}
1025
1026	if (!(uap->flags & MS_SYSSPACE)) {
1027		nfs_free_args(args, fhandle);
1028		kmem_free(args, sizeof (*args));
1029	}
1030
1031	if (rtvp != NULL)
1032		VN_RELE(rtvp);
1033
1034	if (mntzone != NULL)
1035		zone_rele(mntzone);
1036
1037	return (error);
1038}
1039
1040/*
1041 * The pathconf information is kept on a linked list of kmem_alloc'ed
1042 * structs. We search the list & add a new struct iff there is no other
1043 * struct with the same information.
1044 * See sys/pathconf.h for ``the rest of the story.''
1045 */
1046static struct pathcnf *allpc = NULL;
1047
1048static int
1049pathconf_copyin(struct nfs_args *args, struct pathcnf *pc)
1050{
1051	STRUCT_DECL(pathcnf, pc_tmp);
1052	STRUCT_HANDLE(nfs_args, ap);
1053	int i;
1054	model_t	model;
1055
1056	model = get_udatamodel();
1057	STRUCT_INIT(pc_tmp, model);
1058	STRUCT_SET_HANDLE(ap, model, args);
1059
1060	if ((STRUCT_FGET(ap, flags) & NFSMNT_POSIX) &&
1061	    STRUCT_FGETP(ap, pathconf) != NULL) {
1062		if (copyin(STRUCT_FGETP(ap, pathconf), STRUCT_BUF(pc_tmp),
1063		    STRUCT_SIZE(pc_tmp)))
1064			return (EFAULT);
1065		if (_PC_ISSET(_PC_ERROR, STRUCT_FGET(pc_tmp, pc_mask)))
1066			return (EINVAL);
1067
1068		pc->pc_link_max = STRUCT_FGET(pc_tmp, pc_link_max);
1069		pc->pc_max_canon = STRUCT_FGET(pc_tmp, pc_max_canon);
1070		pc->pc_max_input = STRUCT_FGET(pc_tmp, pc_max_input);
1071		pc->pc_name_max = STRUCT_FGET(pc_tmp, pc_name_max);
1072		pc->pc_path_max = STRUCT_FGET(pc_tmp, pc_path_max);
1073		pc->pc_pipe_buf = STRUCT_FGET(pc_tmp, pc_pipe_buf);
1074		pc->pc_vdisable = STRUCT_FGET(pc_tmp, pc_vdisable);
1075		pc->pc_xxx = STRUCT_FGET(pc_tmp, pc_xxx);
1076		for (i = 0; i < _PC_N; i++)
1077			pc->pc_mask[i] = STRUCT_FGET(pc_tmp, pc_mask[i]);
1078	}
1079	return (0);
1080}
1081
1082static int
1083pathconf_get(struct mntinfo *mi, struct nfs_args *args)
1084{
1085	struct pathcnf *p, *pc;
1086
1087	pc = args->pathconf;
1088	if (mi->mi_pathconf != NULL) {
1089		pathconf_rele(mi);
1090		mi->mi_pathconf = NULL;
1091	}
1092	if (args->flags & NFSMNT_POSIX &&
1093		args->pathconf != NULL) {
1094
1095		if (_PC_ISSET(_PC_ERROR, pc->pc_mask))
1096			return (EINVAL);
1097
1098		for (p = allpc; p != NULL; p = p->pc_next) {
1099			if (PCCMP(p, pc) == 0)
1100				break;
1101		}
1102		if (p != NULL) {
1103			mi->mi_pathconf = p;
1104			p->pc_refcnt++;
1105		} else {
1106			p = kmem_alloc(sizeof (*p), KM_SLEEP);
1107			bcopy(pc, p, sizeof (struct pathcnf));
1108			p->pc_next = allpc;
1109			p->pc_refcnt = 1;
1110			allpc = mi->mi_pathconf = p;
1111		}
1112	}
1113	return (0);
1114}
1115
1116/*
1117 * release the static pathconf information
1118 */
1119static void
1120pathconf_rele(struct mntinfo *mi)
1121{
1122	if (mi->mi_pathconf != NULL) {
1123		if (--mi->mi_pathconf->pc_refcnt == 0) {
1124			struct pathcnf *p;
1125			struct pathcnf *p2;
1126
1127			p2 = p = allpc;
1128			while (p != NULL && p != mi->mi_pathconf) {
1129				p2 = p;
1130				p = p->pc_next;
1131			}
1132			if (p == NULL) {
1133				panic("mi->pathconf");
1134				/*NOTREACHED*/
1135			}
1136			if (p == allpc)
1137				allpc = p->pc_next;
1138			else
1139				p2->pc_next = p->pc_next;
1140			kmem_free(p, sizeof (*p));
1141			mi->mi_pathconf = NULL;
1142		}
1143	}
1144}
1145
1146static int nfs_dynamic = 1;	/* global variable to enable dynamic retrans. */
1147static ushort_t nfs_max_threads = 8;	/* max number of active async threads */
1148static uint_t nfs_async_clusters = 1;	/* # of reqs from each async queue */
1149static uint_t nfs_cots_timeo = NFS_COTS_TIMEO;
1150
1151static int
1152nfsrootvp(vnode_t **rtvpp, vfs_t *vfsp, struct servinfo *svp,
1153	int flags, cred_t *cr, zone_t *zone)
1154{
1155	vnode_t *rtvp;
1156	mntinfo_t *mi;
1157	dev_t nfs_dev;
1158	struct vattr va;
1159	int error;
1160	rnode_t *rp;
1161	int i;
1162	struct nfs_stats *nfsstatsp;
1163	cred_t *lcr = NULL, *tcr = cr;
1164
1165	nfsstatsp = zone_getspecific(nfsstat_zone_key, nfs_zone());
1166	ASSERT(nfsstatsp != NULL);
1167
1168	/*
1169	 * Create a mount record and link it to the vfs struct.
1170	 */
1171	mi = kmem_zalloc(sizeof (*mi), KM_SLEEP);
1172	mutex_init(&mi->mi_lock, NULL, MUTEX_DEFAULT, NULL);
1173	mutex_init(&mi->mi_remap_lock, NULL, MUTEX_DEFAULT, NULL);
1174	mi->mi_flags = MI_ACL | MI_EXTATTR;
1175	if (!(flags & NFSMNT_SOFT))
1176		mi->mi_flags |= MI_HARD;
1177	if ((flags & NFSMNT_SEMISOFT))
1178		mi->mi_flags |= MI_SEMISOFT;
1179	if ((flags & NFSMNT_NOPRINT))
1180		mi->mi_flags |= MI_NOPRINT;
1181	if (flags & NFSMNT_INT)
1182		mi->mi_flags |= MI_INT;
1183	mi->mi_retrans = NFS_RETRIES;
1184	if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1185	    svp->sv_knconf->knc_semantics == NC_TPI_COTS)
1186		mi->mi_timeo = nfs_cots_timeo;
1187	else
1188		mi->mi_timeo = NFS_TIMEO;
1189	mi->mi_prog = NFS_PROGRAM;
1190	mi->mi_vers = NFS_VERSION;
1191	mi->mi_rfsnames = rfsnames_v2;
1192	mi->mi_reqs = nfsstatsp->nfs_stats_v2.rfsreqcnt_ptr;
1193	mi->mi_call_type = call_type_v2;
1194	mi->mi_ss_call_type = ss_call_type_v2;
1195	mi->mi_timer_type = timer_type_v2;
1196	mi->mi_aclnames = aclnames_v2;
1197	mi->mi_aclreqs = nfsstatsp->nfs_stats_v2.aclreqcnt_ptr;
1198	mi->mi_acl_call_type = acl_call_type_v2;
1199	mi->mi_acl_ss_call_type = acl_ss_call_type_v2;
1200	mi->mi_acl_timer_type = acl_timer_type_v2;
1201	cv_init(&mi->mi_failover_cv, NULL, CV_DEFAULT, NULL);
1202	mi->mi_servers = svp;
1203	mi->mi_curr_serv = svp;
1204	mi->mi_acregmin = SEC2HR(ACREGMIN);
1205	mi->mi_acregmax = SEC2HR(ACREGMAX);
1206	mi->mi_acdirmin = SEC2HR(ACDIRMIN);
1207	mi->mi_acdirmax = SEC2HR(ACDIRMAX);
1208
1209	if (nfs_dynamic)
1210		mi->mi_flags |= MI_DYNAMIC;
1211
1212	if (flags & NFSMNT_DIRECTIO)
1213		mi->mi_flags |= MI_DIRECTIO;
1214
1215	/*
1216	 * Make a vfs struct for nfs.  We do this here instead of below
1217	 * because rtvp needs a vfs before we can do a getattr on it.
1218	 *
1219	 * Assign a unique device id to the mount
1220	 */
1221	mutex_enter(&nfs_minor_lock);
1222	do {
1223		nfs_minor = (nfs_minor + 1) & MAXMIN32;
1224		nfs_dev = makedevice(nfs_major, nfs_minor);
1225	} while (vfs_devismounted(nfs_dev));
1226	mutex_exit(&nfs_minor_lock);
1227
1228	vfsp->vfs_dev = nfs_dev;
1229	vfs_make_fsid(&vfsp->vfs_fsid, nfs_dev, nfsfstyp);
1230	vfsp->vfs_data = (caddr_t)mi;
1231	vfsp->vfs_fstype = nfsfstyp;
1232	vfsp->vfs_bsize = NFS_MAXDATA;
1233
1234	/*
1235	 * Initialize fields used to support async putpage operations.
1236	 */
1237	for (i = 0; i < NFS_ASYNC_TYPES; i++)
1238		mi->mi_async_clusters[i] = nfs_async_clusters;
1239	mi->mi_async_init_clusters = nfs_async_clusters;
1240	mi->mi_async_curr = &mi->mi_async_reqs[0];
1241	mi->mi_max_threads = nfs_max_threads;
1242	mutex_init(&mi->mi_async_lock, NULL, MUTEX_DEFAULT, NULL);
1243	cv_init(&mi->mi_async_reqs_cv, NULL, CV_DEFAULT, NULL);
1244	cv_init(&mi->mi_async_work_cv, NULL, CV_DEFAULT, NULL);
1245	cv_init(&mi->mi_async_cv, NULL, CV_DEFAULT, NULL);
1246
1247	mi->mi_vfsp = vfsp;
1248	zone_hold(mi->mi_zone = zone);
1249	nfs_mi_zonelist_add(mi);
1250
1251	/*
1252	 * Make the root vnode, use it to get attributes,
1253	 * then remake it with the attributes.
1254	 */
1255	rtvp = makenfsnode((fhandle_t *)svp->sv_fhandle.fh_buf,
1256	    NULL, vfsp, gethrtime(), cr, NULL, NULL);
1257
1258	va.va_mask = AT_ALL;
1259
1260	/*
1261	 * If the uid is set then set the creds for secure mounts
1262	 * by proxy processes such as automountd.
1263	 */
1264	if (svp->sv_secdata->uid != 0 &&
1265	    svp->sv_secdata->rpcflavor == RPCSEC_GSS) {
1266		lcr = crdup(cr);
1267		(void) crsetugid(lcr, svp->sv_secdata->uid, crgetgid(cr));
1268		tcr = lcr;
1269	}
1270
1271	error = nfsgetattr(rtvp, &va, tcr);
1272	if (error)
1273		goto bad;
1274	rtvp->v_type = va.va_type;
1275
1276	/*
1277	 * Poll every server to get the filesystem stats; we're
1278	 * only interested in the server's transfer size, and we
1279	 * want the minimum.
1280	 *
1281	 * While we're looping, we'll turn off AUTH_F_TRYNONE,
1282	 * which is only for the mount operation.
1283	 */
1284
1285	mi->mi_tsize = MIN(NFS_MAXDATA, nfstsize());
1286	mi->mi_stsize = MIN(NFS_MAXDATA, nfstsize());
1287
1288	for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
1289		struct nfsstatfs fs;
1290		int douprintf;
1291
1292		douprintf = 1;
1293		mi->mi_curr_serv = svp;
1294
1295		error = rfs2call(mi, RFS_STATFS,
1296			xdr_fhandle, (caddr_t)svp->sv_fhandle.fh_buf,
1297			xdr_statfs, (caddr_t)&fs, tcr, &douprintf,
1298			&fs.fs_status, 0, NULL);
1299		if (error)
1300			goto bad;
1301		mi->mi_stsize = MIN(mi->mi_stsize, fs.fs_tsize);
1302		svp->sv_secdata->flags &= ~AUTH_F_TRYNONE;
1303	}
1304	mi->mi_curr_serv = mi->mi_servers;
1305	mi->mi_curread = mi->mi_tsize;
1306	mi->mi_curwrite = mi->mi_stsize;
1307
1308	/*
1309	 * Start the manager thread responsible for handling async worker
1310	 * threads.
1311	 */
1312	VFS_HOLD(vfsp);	/* add reference for thread */
1313	mi->mi_manager_thread = zthread_create(NULL, 0, nfs_async_manager,
1314					vfsp, 0, minclsyspri);
1315	ASSERT(mi->mi_manager_thread != NULL);
1316
1317	/*
1318	 * Initialize kstats
1319	 */
1320	nfs_mnt_kstat_init(vfsp);
1321
1322	mi->mi_type = rtvp->v_type;
1323
1324	*rtvpp = rtvp;
1325	if (lcr != NULL)
1326		crfree(lcr);
1327
1328	return (0);
1329bad:
1330	/*
1331	 * An error occurred somewhere, need to clean up...
1332	 * We need to release our reference to the root vnode and
1333	 * destroy the mntinfo struct that we just created.
1334	 */
1335	if (lcr != NULL)
1336		crfree(lcr);
1337	rp = VTOR(rtvp);
1338	if (rp->r_flags & RHASHED)
1339		rp_rmhash(rp);
1340	VN_RELE(rtvp);
1341	nfs_async_stop(vfsp);
1342	nfs_async_manager_stop(vfsp);
1343	if (mi->mi_io_kstats) {
1344		kstat_delete(mi->mi_io_kstats);
1345		mi->mi_io_kstats = NULL;
1346	}
1347	if (mi->mi_ro_kstats) {
1348		kstat_delete(mi->mi_ro_kstats);
1349		mi->mi_ro_kstats = NULL;
1350	}
1351	nfs_free_mi(mi);
1352	*rtvpp = NULL;
1353	return (error);
1354}
1355
1356/*
1357 * vfs operations
1358 */
1359static int
1360nfs_unmount(vfs_t *vfsp, int flag, cred_t *cr)
1361{
1362	mntinfo_t *mi;
1363	ushort_t omax;
1364
1365	if (secpolicy_fs_unmount(cr, vfsp) != 0)
1366		return (EPERM);
1367
1368	mi = VFTOMI(vfsp);
1369	if (flag & MS_FORCE) {
1370
1371		vfsp->vfs_flag |= VFS_UNMOUNTED;
1372
1373		/*
1374		 * We are about to stop the async manager.
1375		 * Let every one know not to schedule any
1376		 * more async requests.
1377		 */
1378		mutex_enter(&mi->mi_async_lock);
1379		mi->mi_max_threads = 0;
1380		cv_broadcast(&mi->mi_async_work_cv);
1381		mutex_exit(&mi->mi_async_lock);
1382
1383		/*
1384		 * We need to stop the manager thread explicitly; the worker
1385		 * threads can time out and exit on their own.
1386		 */
1387		nfs_async_manager_stop(vfsp);
1388		destroy_rtable(vfsp, cr);
1389		if (mi->mi_io_kstats) {
1390			kstat_delete(mi->mi_io_kstats);
1391			mi->mi_io_kstats = NULL;
1392		}
1393		if (mi->mi_ro_kstats) {
1394			kstat_delete(mi->mi_ro_kstats);
1395			mi->mi_ro_kstats = NULL;
1396		}
1397		return (0);
1398	}
1399	/*
1400	 * Wait until all asynchronous putpage operations on
1401	 * this file system are complete before flushing rnodes
1402	 * from the cache.
1403	 */
1404	omax = mi->mi_max_threads;
1405	if (nfs_async_stop_sig(vfsp)) {
1406		return (EINTR);
1407	}
1408	rflush(vfsp, cr);
1409	/*
1410	 * If there are any active vnodes on this file system,
1411	 * then the file system is busy and can't be umounted.
1412	 */
1413	if (check_rtable(vfsp)) {
1414		mutex_enter(&mi->mi_async_lock);
1415		mi->mi_max_threads = omax;
1416		mutex_exit(&mi->mi_async_lock);
1417		return (EBUSY);
1418	}
1419	/*
1420	 * The unmount can't fail from now on; stop the manager thread.
1421	 */
1422	nfs_async_manager_stop(vfsp);
1423	/*
1424	 * Destroy all rnodes belonging to this file system from the
1425	 * rnode hash queues and purge any resources allocated to
1426	 * them.
1427	 */
1428	destroy_rtable(vfsp, cr);
1429	if (mi->mi_io_kstats) {
1430		kstat_delete(mi->mi_io_kstats);
1431		mi->mi_io_kstats = NULL;
1432	}
1433	if (mi->mi_ro_kstats) {
1434		kstat_delete(mi->mi_ro_kstats);
1435		mi->mi_ro_kstats = NULL;
1436	}
1437	return (0);
1438}
1439
1440/*
1441 * find root of nfs
1442 */
1443static int
1444nfs_root(vfs_t *vfsp, vnode_t **vpp)
1445{
1446	mntinfo_t *mi;
1447	vnode_t *vp;
1448	servinfo_t *svp;
1449	rnode_t *rp;
1450	int error = 0;
1451
1452	mi = VFTOMI(vfsp);
1453
1454	if (nfs_zone() != mi->mi_zone)
1455		return (EPERM);
1456
1457	svp = mi->mi_curr_serv;
1458	if (svp && (svp->sv_flags & SV_ROOT_STALE)) {
1459		mutex_enter(&svp->sv_lock);
1460		svp->sv_flags &= ~SV_ROOT_STALE;
1461		mutex_exit(&svp->sv_lock);
1462		error = ENOENT;
1463	}
1464
1465	vp = makenfsnode((fhandle_t *)mi->mi_curr_serv->sv_fhandle.fh_buf,
1466	    NULL, vfsp, gethrtime(), CRED(), NULL, NULL);
1467
1468	/*
1469	 * if the SV_ROOT_STALE flag was reset above, reset the
1470	 * RSTALE flag if needed and return an error
1471	 */
1472	if (error == ENOENT) {
1473		rp = VTOR(vp);
1474		if (svp && rp->r_flags & RSTALE) {
1475			mutex_enter(&rp->r_statelock);
1476			rp->r_flags &= ~RSTALE;
1477			mutex_exit(&rp->r_statelock);
1478		}
1479		VN_RELE(vp);
1480		return (error);
1481	}
1482
1483	ASSERT(vp->v_type == VNON || vp->v_type == mi->mi_type);
1484
1485	vp->v_type = mi->mi_type;
1486
1487	*vpp = vp;
1488
1489	return (0);
1490}
1491
1492/*
1493 * Get file system statistics.
1494 */
1495static int
1496nfs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp)
1497{
1498	int error;
1499	mntinfo_t *mi;
1500	struct nfsstatfs fs;
1501	int douprintf;
1502	failinfo_t fi;
1503	vnode_t *vp;
1504
1505	error = nfs_root(vfsp, &vp);
1506	if (error)
1507		return (error);
1508
1509	mi = VFTOMI(vfsp);
1510	douprintf = 1;
1511	fi.vp = vp;
1512	fi.fhp = NULL;		/* no need to update, filehandle not copied */
1513	fi.copyproc = nfscopyfh;
1514	fi.lookupproc = nfslookup;
1515	fi.xattrdirproc = acl_getxattrdir2;
1516
1517	error = rfs2call(mi, RFS_STATFS,
1518			xdr_fhandle, (caddr_t)VTOFH(vp),
1519			xdr_statfs, (caddr_t)&fs, CRED(), &douprintf,
1520			&fs.fs_status, 0, &fi);
1521
1522	if (!error) {
1523		error = geterrno(fs.fs_status);
1524		if (!error) {
1525			mutex_enter(&mi->mi_lock);
1526			if (mi->mi_stsize) {
1527				mi->mi_stsize = MIN(mi->mi_stsize, fs.fs_tsize);
1528			} else {
1529				mi->mi_stsize = fs.fs_tsize;
1530				mi->mi_curwrite = mi->mi_stsize;
1531			}
1532			mutex_exit(&mi->mi_lock);
1533			sbp->f_bsize = fs.fs_bsize;
1534			sbp->f_frsize = fs.fs_bsize;
1535			sbp->f_blocks = (fsblkcnt64_t)fs.fs_blocks;
1536			sbp->f_bfree = (fsblkcnt64_t)fs.fs_bfree;
1537			/*
1538			 * Some servers may return negative available
1539			 * block counts.  They may do this because they
1540			 * calculate the number of available blocks by
1541			 * subtracting the number of used blocks from
1542			 * the total number of blocks modified by the
1543			 * minimum free value.  For example, if the
1544			 * minumum free percentage is 10 and the file
1545			 * system is greater than 90 percent full, then
1546			 * 90 percent of the total blocks minus the
1547			 * actual number of used blocks may be a
1548			 * negative number.
1549			 *
1550			 * In this case, we need to sign extend the
1551			 * negative number through the assignment from
1552			 * the 32 bit bavail count to the 64 bit bavail
1553			 * count.
1554			 *
1555			 * We need to be able to discern between there
1556			 * just being a lot of available blocks on the
1557			 * file system and the case described above.
1558			 * We are making the assumption that it does
1559			 * not make sense to have more available blocks
1560			 * than there are free blocks.  So, if there
1561			 * are, then we treat the number as if it were
1562			 * a negative number and arrange to have it
1563			 * sign extended when it is converted from 32
1564			 * bits to 64 bits.
1565			 */
1566			if (fs.fs_bavail <= fs.fs_bfree)
1567				sbp->f_bavail = (fsblkcnt64_t)fs.fs_bavail;
1568			else {
1569				sbp->f_bavail =
1570					(fsblkcnt64_t)((long)fs.fs_bavail);
1571			}
1572			sbp->f_files = (fsfilcnt64_t)-1;
1573			sbp->f_ffree = (fsfilcnt64_t)-1;
1574			sbp->f_favail = (fsfilcnt64_t)-1;
1575			sbp->f_fsid = (unsigned long)vfsp->vfs_fsid.val[0];
1576			(void) strncpy(sbp->f_basetype,
1577				vfssw[vfsp->vfs_fstype].vsw_name, FSTYPSZ);
1578			sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
1579			sbp->f_namemax = (uint32_t)-1;
1580		} else {
1581			PURGE_STALE_FH(error, vp, CRED());
1582		}
1583	}
1584
1585	VN_RELE(vp);
1586
1587	return (error);
1588}
1589
1590static kmutex_t nfs_syncbusy;
1591
1592/*
1593 * Flush dirty nfs files for file system vfsp.
1594 * If vfsp == NULL, all nfs files are flushed.
1595 */
1596/* ARGSUSED */
1597static int
1598nfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
1599{
1600	/*
1601	 * Cross-zone calls are OK here, since this translates to a
1602	 * VOP_PUTPAGE(B_ASYNC), which gets picked up by the right zone.
1603	 */
1604	if (!(flag & SYNC_ATTR) && mutex_tryenter(&nfs_syncbusy) != 0) {
1605		rflush(vfsp, cr);
1606		mutex_exit(&nfs_syncbusy);
1607	}
1608	return (0);
1609}
1610
1611/* ARGSUSED */
1612static int
1613nfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
1614{
1615	int error;
1616	vnode_t *vp;
1617	struct vattr va;
1618	struct nfs_fid *nfsfidp = (struct nfs_fid *)fidp;
1619	zoneid_t zoneid = VFTOMI(vfsp)->mi_zone->zone_id;
1620
1621	if (nfs_zone() != VFTOMI(vfsp)->mi_zone)
1622		return (EPERM);
1623	if (fidp->fid_len != (sizeof (*nfsfidp) - sizeof (short))) {
1624#ifdef DEBUG
1625		zcmn_err(zoneid, CE_WARN,
1626		    "nfs_vget: bad fid len, %d/%d", fidp->fid_len,
1627		    (int)(sizeof (*nfsfidp) - sizeof (short)));
1628#endif
1629		*vpp = NULL;
1630		return (ESTALE);
1631	}
1632
1633	vp = makenfsnode((fhandle_t *)(nfsfidp->nf_data), NULL, vfsp,
1634	    gethrtime(), CRED(), NULL, NULL);
1635
1636	if (VTOR(vp)->r_flags & RSTALE) {
1637		VN_RELE(vp);
1638		*vpp = NULL;
1639		return (ENOENT);
1640	}
1641
1642	if (vp->v_type == VNON) {
1643		va.va_mask = AT_ALL;
1644		error = nfsgetattr(vp, &va, CRED());
1645		if (error) {
1646			VN_RELE(vp);
1647			*vpp = NULL;
1648			return (error);
1649		}
1650		vp->v_type = va.va_type;
1651	}
1652
1653	*vpp = vp;
1654
1655	return (0);
1656}
1657
1658/* ARGSUSED */
1659static int
1660nfs_mountroot(vfs_t *vfsp, whymountroot_t why)
1661{
1662	vnode_t *rtvp;
1663	char root_hostname[SYS_NMLN+1];
1664	struct servinfo *svp;
1665	int error;
1666	int vfsflags;
1667	size_t size;
1668	char *root_path;
1669	struct pathname pn;
1670	char *name;
1671	cred_t *cr;
1672	struct nfs_args args;		/* nfs mount arguments */
1673	static char token[10];
1674
1675	bzero(&args, sizeof (args));
1676
1677	/* do this BEFORE getfile which causes xid stamps to be initialized */
1678	clkset(-1L);		/* hack for now - until we get time svc? */
1679
1680	if (why == ROOT_REMOUNT) {
1681		/*
1682		 * Shouldn't happen.
1683		 */
1684		panic("nfs_mountroot: why == ROOT_REMOUNT");
1685	}
1686
1687	if (why == ROOT_UNMOUNT) {
1688		/*
1689		 * Nothing to do for NFS.
1690		 */
1691		return (0);
1692	}
1693
1694	/*
1695	 * why == ROOT_INIT
1696	 */
1697
1698	name = token;
1699	*name = 0;
1700	getfsname("root", name, sizeof (token));
1701
1702	pn_alloc(&pn);
1703	root_path = pn.pn_path;
1704
1705	svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
1706	svp->sv_knconf = kmem_zalloc(sizeof (*svp->sv_knconf), KM_SLEEP);
1707	svp->sv_knconf->knc_protofmly = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
1708	svp->sv_knconf->knc_proto = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
1709
1710	/*
1711	 * Get server address
1712	 * Get the root fhandle
1713	 * Get server's transport
1714	 * Get server's hostname
1715	 * Get options
1716	 */
1717	args.addr = &svp->sv_addr;
1718	args.fh = (char *)&svp->sv_fhandle.fh_buf;
1719	args.knconf = svp->sv_knconf;
1720	args.hostname = root_hostname;
1721	vfsflags = 0;
1722	if (error = mount_root(*name ? name : "root", root_path, NFS_VERSION,
1723	    &args, &vfsflags)) {
1724		nfs_cmn_err(error, CE_WARN,
1725		    "nfs_mountroot: mount_root failed: %m");
1726		sv_free(svp);
1727		pn_free(&pn);
1728		return (error);
1729	}
1730	svp->sv_fhandle.fh_len = NFS_FHSIZE;
1731	svp->sv_hostnamelen = (int)(strlen(root_hostname) + 1);
1732	svp->sv_hostname = kmem_alloc(svp->sv_hostnamelen, KM_SLEEP);
1733	(void) strcpy(svp->sv_hostname, root_hostname);
1734
1735	/*
1736	 * Force root partition to always be mounted with AUTH_UNIX for now
1737	 */
1738	svp->sv_secdata = kmem_alloc(sizeof (*svp->sv_secdata), KM_SLEEP);
1739	svp->sv_secdata->secmod = AUTH_UNIX;
1740	svp->sv_secdata->rpcflavor = AUTH_UNIX;
1741	svp->sv_secdata->data = NULL;
1742
1743	cr = crgetcred();
1744	rtvp = NULL;
1745
1746	error = nfsrootvp(&rtvp, vfsp, svp, args.flags, cr, global_zone);
1747
1748	crfree(cr);
1749
1750	if (error) {
1751		pn_free(&pn);
1752		sv_free(svp);
1753		return (error);
1754	}
1755
1756	error = nfs_setopts(rtvp, DATAMODEL_NATIVE, &args);
1757	if (error) {
1758		nfs_cmn_err(error, CE_WARN,
1759		    "nfs_mountroot: invalid root mount options");
1760		pn_free(&pn);
1761		goto errout;
1762	}
1763
1764	(void) vfs_lock_wait(vfsp);
1765	vfs_add(NULL, vfsp, vfsflags);
1766	vfs_unlock(vfsp);
1767
1768	size = strlen(svp->sv_hostname);
1769	(void) strcpy(rootfs.bo_name, svp->sv_hostname);
1770	rootfs.bo_name[size] = ':';
1771	(void) strcpy(&rootfs.bo_name[size + 1], root_path);
1772
1773	pn_free(&pn);
1774
1775errout:
1776	if (error) {
1777		sv_free(svp);
1778		nfs_async_stop(vfsp);
1779		nfs_async_manager_stop(vfsp);
1780	}
1781
1782	if (rtvp != NULL)
1783		VN_RELE(rtvp);
1784
1785	return (error);
1786}
1787
1788/*
1789 * Initialization routine for VFS routines.  Should only be called once
1790 */
1791int
1792nfs_vfsinit(void)
1793{
1794	mutex_init(&nfs_syncbusy, NULL, MUTEX_DEFAULT, NULL);
1795	return (0);
1796}
1797
1798void
1799nfs_vfsfini(void)
1800{
1801	mutex_destroy(&nfs_syncbusy);
1802}
1803
1804void
1805nfs_freevfs(vfs_t *vfsp)
1806{
1807	mntinfo_t *mi;
1808	servinfo_t *svp;
1809
1810	/* free up the resources */
1811	mi = VFTOMI(vfsp);
1812	pathconf_rele(mi);
1813	svp = mi->mi_servers;
1814	mi->mi_servers = mi->mi_curr_serv = NULL;
1815	sv_free(svp);
1816
1817	/*
1818	 * By this time we should have already deleted the
1819	 * mi kstats in the unmount code. If they are still around
1820	 * somethings wrong
1821	 */
1822	ASSERT(mi->mi_io_kstats == NULL);
1823	nfs_free_mi(mi);
1824}
1825