1/*
2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1989, 1993
31 *	The Regents of the University of California.  All rights reserved.
32 *
33 * This code is derived from software contributed to Berkeley by
34 * Rick Macklem at The University of Guelph.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 *    notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 *    notice, this list of conditions and the following disclaimer in the
43 *    documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 *    must display the following acknowledgement:
46 *	This product includes software developed by the University of
47 *	California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 *    may be used to endorse or promote products derived from this software
50 *    without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 *	@(#)nfs_vnops.c	8.16 (Berkeley) 5/27/95
65 * FreeBSD-Id: nfs_vnops.c,v 1.72 1997/11/07 09:20:48 phk Exp $
66 */
67
68
69/*
70 * vnode op calls for Sun NFS version 2 and 3
71 */
72#include <sys/param.h>
73#include <sys/kernel.h>
74#include <sys/systm.h>
75#include <sys/resourcevar.h>
76#include <sys/proc_internal.h>
77#include <sys/kauth.h>
78#include <sys/mount_internal.h>
79#include <sys/malloc.h>
80#include <sys/kpi_mbuf.h>
81#include <sys/conf.h>
82#include <sys/vnode_internal.h>
83#include <sys/dirent.h>
84#include <sys/fcntl.h>
85#include <sys/lockf.h>
86#include <sys/ubc_internal.h>
87#include <sys/attr.h>
88#include <sys/signalvar.h>
89#include <sys/uio_internal.h>
90
91#include <vfs/vfs_support.h>
92
93#include <sys/vm.h>
94
95#include <sys/time.h>
96#include <kern/clock.h>
97#include <libkern/OSAtomic.h>
98
99#include <miscfs/fifofs/fifo.h>
100#include <miscfs/specfs/specdev.h>
101
102#include <nfs/rpcv2.h>
103#include <nfs/nfsproto.h>
104#include <nfs/nfs.h>
105#include <nfs/nfsnode.h>
106#include <nfs/nfs_gss.h>
107#include <nfs/nfsmount.h>
108#include <nfs/nfs_lock.h>
109#include <nfs/xdr_subs.h>
110#include <nfs/nfsm_subs.h>
111
112#include <net/if.h>
113#include <netinet/in.h>
114#include <netinet/in_var.h>
115
116#include <vm/vm_kern.h>
117#include <vm/vm_pageout.h>
118
119#include <kern/task.h>
120#include <kern/sched_prim.h>
121
122#define NFS_VNOP_DBG(...) NFS_DBG(NFS_FAC_VNOP, 7, ## __VA_ARGS__)
123
124/*
125 * NFS vnode ops
126 */
127int	nfs_vnop_lookup(struct vnop_lookup_args *);
128int	nfsspec_vnop_read(struct vnop_read_args *);
129int	nfsspec_vnop_write(struct vnop_write_args *);
130int	nfsspec_vnop_close(struct vnop_close_args *);
131#if FIFO
132int	nfsfifo_vnop_read(struct vnop_read_args *);
133int	nfsfifo_vnop_write(struct vnop_write_args *);
134int	nfsfifo_vnop_close(struct vnop_close_args *);
135#endif
136int	nfs_vnop_ioctl(struct vnop_ioctl_args *);
137int	nfs_vnop_select(struct vnop_select_args *);
138int	nfs_vnop_setattr(struct vnop_setattr_args *);
139int	nfs_vnop_fsync(struct vnop_fsync_args *);
140int	nfs_vnop_rename(struct vnop_rename_args *);
141int	nfs_vnop_readdir(struct vnop_readdir_args *);
142int	nfs_vnop_readlink(struct vnop_readlink_args *);
143int	nfs_vnop_pathconf(struct vnop_pathconf_args *);
144int	nfs_vnop_pagein(struct vnop_pagein_args *);
145int	nfs_vnop_pageout(struct vnop_pageout_args *);
146int	nfs_vnop_blktooff(struct vnop_blktooff_args *);
147int	nfs_vnop_offtoblk(struct vnop_offtoblk_args *);
148int	nfs_vnop_blockmap(struct vnop_blockmap_args *);
149int	nfs_vnop_monitor(struct vnop_monitor_args *);
150
151int	nfs3_vnop_create(struct vnop_create_args *);
152int	nfs3_vnop_mknod(struct vnop_mknod_args *);
153int	nfs3_vnop_getattr(struct vnop_getattr_args *);
154int	nfs3_vnop_link(struct vnop_link_args *);
155int	nfs3_vnop_mkdir(struct vnop_mkdir_args *);
156int	nfs3_vnop_rmdir(struct vnop_rmdir_args *);
157int	nfs3_vnop_symlink(struct vnop_symlink_args *);
158
159vnop_t **nfsv2_vnodeop_p;
160static struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = {
161	{ &vnop_default_desc, (vnop_t *)vn_default_error },
162	{ &vnop_lookup_desc, (vnop_t *)nfs_vnop_lookup },	/* lookup */
163	{ &vnop_create_desc, (vnop_t *)nfs3_vnop_create },	/* create */
164	{ &vnop_mknod_desc, (vnop_t *)nfs3_vnop_mknod },	/* mknod */
165	{ &vnop_open_desc, (vnop_t *)nfs_vnop_open },		/* open */
166	{ &vnop_close_desc, (vnop_t *)nfs_vnop_close },		/* close */
167	{ &vnop_access_desc, (vnop_t *)nfs_vnop_access },	/* access */
168	{ &vnop_getattr_desc, (vnop_t *)nfs3_vnop_getattr },	/* getattr */
169	{ &vnop_setattr_desc, (vnop_t *)nfs_vnop_setattr },	/* setattr */
170	{ &vnop_read_desc, (vnop_t *)nfs_vnop_read },		/* read */
171	{ &vnop_write_desc, (vnop_t *)nfs_vnop_write },		/* write */
172	{ &vnop_ioctl_desc, (vnop_t *)nfs_vnop_ioctl },		/* ioctl */
173	{ &vnop_select_desc, (vnop_t *)nfs_vnop_select },	/* select */
174	{ &vnop_revoke_desc, (vnop_t *)nfs_vnop_revoke },	/* revoke */
175	{ &vnop_mmap_desc, (vnop_t *)nfs_vnop_mmap },		/* mmap */
176	{ &vnop_mnomap_desc, (vnop_t *)nfs_vnop_mnomap },	/* mnomap */
177	{ &vnop_fsync_desc, (vnop_t *)nfs_vnop_fsync },		/* fsync */
178	{ &vnop_remove_desc, (vnop_t *)nfs_vnop_remove },	/* remove */
179	{ &vnop_link_desc, (vnop_t *)nfs3_vnop_link },		/* link */
180	{ &vnop_rename_desc, (vnop_t *)nfs_vnop_rename },	/* rename */
181	{ &vnop_mkdir_desc, (vnop_t *)nfs3_vnop_mkdir },	/* mkdir */
182	{ &vnop_rmdir_desc, (vnop_t *)nfs3_vnop_rmdir },	/* rmdir */
183	{ &vnop_symlink_desc, (vnop_t *)nfs3_vnop_symlink },	/* symlink */
184	{ &vnop_readdir_desc, (vnop_t *)nfs_vnop_readdir },	/* readdir */
185	{ &vnop_readlink_desc, (vnop_t *)nfs_vnop_readlink },	/* readlink */
186	{ &vnop_inactive_desc, (vnop_t *)nfs_vnop_inactive },	/* inactive */
187	{ &vnop_reclaim_desc, (vnop_t *)nfs_vnop_reclaim },	/* reclaim */
188	{ &vnop_strategy_desc, (vnop_t *)err_strategy },	/* strategy */
189	{ &vnop_pathconf_desc, (vnop_t *)nfs_vnop_pathconf },	/* pathconf */
190	{ &vnop_advlock_desc, (vnop_t *)nfs_vnop_advlock },	/* advlock */
191	{ &vnop_bwrite_desc, (vnop_t *)err_bwrite },		/* bwrite */
192	{ &vnop_pagein_desc, (vnop_t *)nfs_vnop_pagein },	/* Pagein */
193	{ &vnop_pageout_desc, (vnop_t *)nfs_vnop_pageout },	/* Pageout */
194	{ &vnop_copyfile_desc, (vnop_t *)err_copyfile },	/* Copyfile */
195	{ &vnop_blktooff_desc, (vnop_t *)nfs_vnop_blktooff },	/* blktooff */
196	{ &vnop_offtoblk_desc, (vnop_t *)nfs_vnop_offtoblk },	/* offtoblk */
197	{ &vnop_blockmap_desc, (vnop_t *)nfs_vnop_blockmap },	/* blockmap */
198	{ &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor },	/* monitor */
199	{ NULL, NULL }
200};
201struct vnodeopv_desc nfsv2_vnodeop_opv_desc =
202	{ &nfsv2_vnodeop_p, nfsv2_vnodeop_entries };
203
204vnop_t **nfsv4_vnodeop_p;
205static struct vnodeopv_entry_desc nfsv4_vnodeop_entries[] = {
206	{ &vnop_default_desc, (vnop_t *)vn_default_error },
207	{ &vnop_lookup_desc, (vnop_t *)nfs_vnop_lookup },	/* lookup */
208	{ &vnop_create_desc, (vnop_t *)nfs4_vnop_create },	/* create */
209	{ &vnop_mknod_desc, (vnop_t *)nfs4_vnop_mknod },	/* mknod */
210	{ &vnop_open_desc, (vnop_t *)nfs_vnop_open },		/* open */
211	{ &vnop_close_desc, (vnop_t *)nfs_vnop_close },		/* close */
212	{ &vnop_access_desc, (vnop_t *)nfs_vnop_access },	/* access */
213	{ &vnop_getattr_desc, (vnop_t *)nfs4_vnop_getattr },	/* getattr */
214	{ &vnop_setattr_desc, (vnop_t *)nfs_vnop_setattr },	/* setattr */
215	{ &vnop_read_desc, (vnop_t *)nfs_vnop_read },		/* read */
216	{ &vnop_write_desc, (vnop_t *)nfs_vnop_write },		/* write */
217	{ &vnop_ioctl_desc, (vnop_t *)nfs_vnop_ioctl },		/* ioctl */
218	{ &vnop_select_desc, (vnop_t *)nfs_vnop_select },	/* select */
219	{ &vnop_revoke_desc, (vnop_t *)nfs_vnop_revoke },	/* revoke */
220	{ &vnop_mmap_desc, (vnop_t *)nfs_vnop_mmap },		/* mmap */
221	{ &vnop_mnomap_desc, (vnop_t *)nfs_vnop_mnomap },	/* mnomap */
222	{ &vnop_fsync_desc, (vnop_t *)nfs_vnop_fsync },		/* fsync */
223	{ &vnop_remove_desc, (vnop_t *)nfs_vnop_remove },	/* remove */
224	{ &vnop_link_desc, (vnop_t *)nfs4_vnop_link },		/* link */
225	{ &vnop_rename_desc, (vnop_t *)nfs_vnop_rename },	/* rename */
226	{ &vnop_mkdir_desc, (vnop_t *)nfs4_vnop_mkdir },	/* mkdir */
227	{ &vnop_rmdir_desc, (vnop_t *)nfs4_vnop_rmdir },	/* rmdir */
228	{ &vnop_symlink_desc, (vnop_t *)nfs4_vnop_symlink },	/* symlink */
229	{ &vnop_readdir_desc, (vnop_t *)nfs_vnop_readdir },	/* readdir */
230	{ &vnop_readlink_desc, (vnop_t *)nfs_vnop_readlink },	/* readlink */
231	{ &vnop_inactive_desc, (vnop_t *)nfs_vnop_inactive },	/* inactive */
232	{ &vnop_reclaim_desc, (vnop_t *)nfs_vnop_reclaim },	/* reclaim */
233	{ &vnop_strategy_desc, (vnop_t *)err_strategy },	/* strategy */
234	{ &vnop_pathconf_desc, (vnop_t *)nfs_vnop_pathconf },	/* pathconf */
235	{ &vnop_advlock_desc, (vnop_t *)nfs_vnop_advlock },	/* advlock */
236	{ &vnop_bwrite_desc, (vnop_t *)err_bwrite },		/* bwrite */
237	{ &vnop_pagein_desc, (vnop_t *)nfs_vnop_pagein },	/* Pagein */
238	{ &vnop_pageout_desc, (vnop_t *)nfs_vnop_pageout },	/* Pageout */
239	{ &vnop_copyfile_desc, (vnop_t *)err_copyfile },	/* Copyfile */
240	{ &vnop_blktooff_desc, (vnop_t *)nfs_vnop_blktooff },	/* blktooff */
241	{ &vnop_offtoblk_desc, (vnop_t *)nfs_vnop_offtoblk },	/* offtoblk */
242	{ &vnop_blockmap_desc, (vnop_t *)nfs_vnop_blockmap },	/* blockmap */
243	{ &vnop_getxattr_desc, (vnop_t *)nfs4_vnop_getxattr },	/* getxattr */
244	{ &vnop_setxattr_desc, (vnop_t *)nfs4_vnop_setxattr },	/* setxattr */
245	{ &vnop_removexattr_desc, (vnop_t *)nfs4_vnop_removexattr },/* removexattr */
246	{ &vnop_listxattr_desc, (vnop_t *)nfs4_vnop_listxattr },/* listxattr */
247#if NAMEDSTREAMS
248	{ &vnop_getnamedstream_desc, (vnop_t *)nfs4_vnop_getnamedstream },	/* getnamedstream */
249	{ &vnop_makenamedstream_desc, (vnop_t *)nfs4_vnop_makenamedstream },	/* makenamedstream */
250	{ &vnop_removenamedstream_desc, (vnop_t *)nfs4_vnop_removenamedstream },/* removenamedstream */
251#endif
252	{ &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor },	/* monitor */
253	{ NULL, NULL }
254};
255struct vnodeopv_desc nfsv4_vnodeop_opv_desc =
256	{ &nfsv4_vnodeop_p, nfsv4_vnodeop_entries };
257
258/*
259 * Special device vnode ops
260 */
261vnop_t **spec_nfsv2nodeop_p;
262static struct vnodeopv_entry_desc spec_nfsv2nodeop_entries[] = {
263	{ &vnop_default_desc, (vnop_t *)vn_default_error },
264	{ &vnop_lookup_desc, (vnop_t *)spec_lookup },		/* lookup */
265	{ &vnop_create_desc, (vnop_t *)spec_create },		/* create */
266	{ &vnop_mknod_desc, (vnop_t *)spec_mknod },		/* mknod */
267	{ &vnop_open_desc, (vnop_t *)spec_open },		/* open */
268	{ &vnop_close_desc, (vnop_t *)nfsspec_vnop_close },	/* close */
269	{ &vnop_getattr_desc, (vnop_t *)nfs3_vnop_getattr },	/* getattr */
270	{ &vnop_setattr_desc, (vnop_t *)nfs_vnop_setattr },	/* setattr */
271	{ &vnop_read_desc, (vnop_t *)nfsspec_vnop_read },	/* read */
272	{ &vnop_write_desc, (vnop_t *)nfsspec_vnop_write },	/* write */
273	{ &vnop_ioctl_desc, (vnop_t *)spec_ioctl },		/* ioctl */
274	{ &vnop_select_desc, (vnop_t *)spec_select },		/* select */
275	{ &vnop_revoke_desc, (vnop_t *)spec_revoke },		/* revoke */
276	{ &vnop_mmap_desc, (vnop_t *)spec_mmap },		/* mmap */
277	{ &vnop_fsync_desc, (vnop_t *)nfs_vnop_fsync },		/* fsync */
278	{ &vnop_remove_desc, (vnop_t *)spec_remove },		/* remove */
279	{ &vnop_link_desc, (vnop_t *)spec_link },		/* link */
280	{ &vnop_rename_desc, (vnop_t *)spec_rename },		/* rename */
281	{ &vnop_mkdir_desc, (vnop_t *)spec_mkdir },		/* mkdir */
282	{ &vnop_rmdir_desc, (vnop_t *)spec_rmdir },		/* rmdir */
283	{ &vnop_symlink_desc, (vnop_t *)spec_symlink },		/* symlink */
284	{ &vnop_readdir_desc, (vnop_t *)spec_readdir },		/* readdir */
285	{ &vnop_readlink_desc, (vnop_t *)spec_readlink },	/* readlink */
286	{ &vnop_inactive_desc, (vnop_t *)nfs_vnop_inactive },	/* inactive */
287	{ &vnop_reclaim_desc, (vnop_t *)nfs_vnop_reclaim },	/* reclaim */
288	{ &vnop_strategy_desc, (vnop_t *)spec_strategy },	/* strategy */
289	{ &vnop_pathconf_desc, (vnop_t *)spec_pathconf },	/* pathconf */
290	{ &vnop_advlock_desc, (vnop_t *)spec_advlock },		/* advlock */
291	{ &vnop_bwrite_desc, (vnop_t *)vn_bwrite },		/* bwrite */
292	{ &vnop_pagein_desc, (vnop_t *)nfs_vnop_pagein },	/* Pagein */
293	{ &vnop_pageout_desc, (vnop_t *)nfs_vnop_pageout },	/* Pageout */
294	{ &vnop_blktooff_desc, (vnop_t *)nfs_vnop_blktooff },	/* blktooff */
295	{ &vnop_offtoblk_desc, (vnop_t *)nfs_vnop_offtoblk },	/* offtoblk */
296	{ &vnop_blockmap_desc, (vnop_t *)nfs_vnop_blockmap },	/* blockmap */
297	{ &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor },	/* monitor */
298	{ NULL, NULL }
299};
300struct vnodeopv_desc spec_nfsv2nodeop_opv_desc =
301	{ &spec_nfsv2nodeop_p, spec_nfsv2nodeop_entries };
302vnop_t **spec_nfsv4nodeop_p;
303static struct vnodeopv_entry_desc spec_nfsv4nodeop_entries[] = {
304	{ &vnop_default_desc, (vnop_t *)vn_default_error },
305	{ &vnop_lookup_desc, (vnop_t *)spec_lookup },		/* lookup */
306	{ &vnop_create_desc, (vnop_t *)spec_create },		/* create */
307	{ &vnop_mknod_desc, (vnop_t *)spec_mknod },		/* mknod */
308	{ &vnop_open_desc, (vnop_t *)spec_open },		/* open */
309	{ &vnop_close_desc, (vnop_t *)nfsspec_vnop_close },	/* close */
310	{ &vnop_getattr_desc, (vnop_t *)nfs4_vnop_getattr },	/* getattr */
311	{ &vnop_setattr_desc, (vnop_t *)nfs_vnop_setattr },	/* setattr */
312	{ &vnop_read_desc, (vnop_t *)nfsspec_vnop_read },	/* read */
313	{ &vnop_write_desc, (vnop_t *)nfsspec_vnop_write },	/* write */
314	{ &vnop_ioctl_desc, (vnop_t *)spec_ioctl },		/* ioctl */
315	{ &vnop_select_desc, (vnop_t *)spec_select },		/* select */
316	{ &vnop_revoke_desc, (vnop_t *)spec_revoke },		/* revoke */
317	{ &vnop_mmap_desc, (vnop_t *)spec_mmap },		/* mmap */
318	{ &vnop_fsync_desc, (vnop_t *)nfs_vnop_fsync },		/* fsync */
319	{ &vnop_remove_desc, (vnop_t *)spec_remove },		/* remove */
320	{ &vnop_link_desc, (vnop_t *)spec_link },		/* link */
321	{ &vnop_rename_desc, (vnop_t *)spec_rename },		/* rename */
322	{ &vnop_mkdir_desc, (vnop_t *)spec_mkdir },		/* mkdir */
323	{ &vnop_rmdir_desc, (vnop_t *)spec_rmdir },		/* rmdir */
324	{ &vnop_symlink_desc, (vnop_t *)spec_symlink },		/* symlink */
325	{ &vnop_readdir_desc, (vnop_t *)spec_readdir },		/* readdir */
326	{ &vnop_readlink_desc, (vnop_t *)spec_readlink },	/* readlink */
327	{ &vnop_inactive_desc, (vnop_t *)nfs_vnop_inactive },	/* inactive */
328	{ &vnop_reclaim_desc, (vnop_t *)nfs_vnop_reclaim },	/* reclaim */
329	{ &vnop_strategy_desc, (vnop_t *)spec_strategy },	/* strategy */
330	{ &vnop_pathconf_desc, (vnop_t *)spec_pathconf },	/* pathconf */
331	{ &vnop_advlock_desc, (vnop_t *)spec_advlock },		/* advlock */
332	{ &vnop_bwrite_desc, (vnop_t *)vn_bwrite },		/* bwrite */
333	{ &vnop_pagein_desc, (vnop_t *)nfs_vnop_pagein },	/* Pagein */
334	{ &vnop_pageout_desc, (vnop_t *)nfs_vnop_pageout },	/* Pageout */
335	{ &vnop_blktooff_desc, (vnop_t *)nfs_vnop_blktooff },	/* blktooff */
336	{ &vnop_offtoblk_desc, (vnop_t *)nfs_vnop_offtoblk },	/* offtoblk */
337	{ &vnop_blockmap_desc, (vnop_t *)nfs_vnop_blockmap },	/* blockmap */
338	{ &vnop_getxattr_desc, (vnop_t *)nfs4_vnop_getxattr },	/* getxattr */
339	{ &vnop_setxattr_desc, (vnop_t *)nfs4_vnop_setxattr },	/* setxattr */
340	{ &vnop_removexattr_desc, (vnop_t *)nfs4_vnop_removexattr },/* removexattr */
341	{ &vnop_listxattr_desc, (vnop_t *)nfs4_vnop_listxattr },/* listxattr */
342#if NAMEDSTREAMS
343	{ &vnop_getnamedstream_desc, (vnop_t *)nfs4_vnop_getnamedstream },	/* getnamedstream */
344	{ &vnop_makenamedstream_desc, (vnop_t *)nfs4_vnop_makenamedstream },	/* makenamedstream */
345	{ &vnop_removenamedstream_desc, (vnop_t *)nfs4_vnop_removenamedstream },/* removenamedstream */
346#endif
347	{ &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor },	/* monitor */
348	{ NULL, NULL }
349};
350struct vnodeopv_desc spec_nfsv4nodeop_opv_desc =
351	{ &spec_nfsv4nodeop_p, spec_nfsv4nodeop_entries };
352
353#if FIFO
354vnop_t **fifo_nfsv2nodeop_p;
355static struct vnodeopv_entry_desc fifo_nfsv2nodeop_entries[] = {
356	{ &vnop_default_desc, (vnop_t *)vn_default_error },
357	{ &vnop_lookup_desc, (vnop_t *)fifo_lookup },		/* lookup */
358	{ &vnop_create_desc, (vnop_t *)fifo_create },		/* create */
359	{ &vnop_mknod_desc, (vnop_t *)fifo_mknod },		/* mknod */
360	{ &vnop_open_desc, (vnop_t *)fifo_open },		/* open */
361	{ &vnop_close_desc, (vnop_t *)nfsfifo_vnop_close },	/* close */
362	{ &vnop_getattr_desc, (vnop_t *)nfs3_vnop_getattr },	/* getattr */
363	{ &vnop_setattr_desc, (vnop_t *)nfs_vnop_setattr },	/* setattr */
364	{ &vnop_read_desc, (vnop_t *)nfsfifo_vnop_read },	/* read */
365	{ &vnop_write_desc, (vnop_t *)nfsfifo_vnop_write },	/* write */
366	{ &vnop_ioctl_desc, (vnop_t *)fifo_ioctl },		/* ioctl */
367	{ &vnop_select_desc, (vnop_t *)fifo_select },		/* select */
368	{ &vnop_revoke_desc, (vnop_t *)fifo_revoke },		/* revoke */
369	{ &vnop_mmap_desc, (vnop_t *)fifo_mmap },		/* mmap */
370	{ &vnop_fsync_desc, (vnop_t *)nfs_vnop_fsync },		/* fsync */
371	{ &vnop_remove_desc, (vnop_t *)fifo_remove },		/* remove */
372	{ &vnop_link_desc, (vnop_t *)fifo_link },		/* link */
373	{ &vnop_rename_desc, (vnop_t *)fifo_rename },		/* rename */
374	{ &vnop_mkdir_desc, (vnop_t *)fifo_mkdir },		/* mkdir */
375	{ &vnop_rmdir_desc, (vnop_t *)fifo_rmdir },		/* rmdir */
376	{ &vnop_symlink_desc, (vnop_t *)fifo_symlink },		/* symlink */
377	{ &vnop_readdir_desc, (vnop_t *)fifo_readdir },		/* readdir */
378	{ &vnop_readlink_desc, (vnop_t *)fifo_readlink },	/* readlink */
379	{ &vnop_inactive_desc, (vnop_t *)nfs_vnop_inactive },	/* inactive */
380	{ &vnop_reclaim_desc, (vnop_t *)nfs_vnop_reclaim },	/* reclaim */
381	{ &vnop_strategy_desc, (vnop_t *)fifo_strategy },	/* strategy */
382	{ &vnop_pathconf_desc, (vnop_t *)fifo_pathconf },	/* pathconf */
383	{ &vnop_advlock_desc, (vnop_t *)fifo_advlock },		/* advlock */
384	{ &vnop_bwrite_desc, (vnop_t *)vn_bwrite },		/* bwrite */
385	{ &vnop_pagein_desc, (vnop_t *)nfs_vnop_pagein },	/* Pagein */
386	{ &vnop_pageout_desc, (vnop_t *)nfs_vnop_pageout },	/* Pageout */
387	{ &vnop_blktooff_desc, (vnop_t *)nfs_vnop_blktooff },	/* blktooff */
388	{ &vnop_offtoblk_desc, (vnop_t *)nfs_vnop_offtoblk },	/* offtoblk */
389	{ &vnop_blockmap_desc, (vnop_t *)nfs_vnop_blockmap },	/* blockmap */
390	{ &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor },	/* monitor */
391	{ NULL, NULL }
392};
393struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc =
394	{ &fifo_nfsv2nodeop_p, fifo_nfsv2nodeop_entries };
395
396vnop_t **fifo_nfsv4nodeop_p;
397static struct vnodeopv_entry_desc fifo_nfsv4nodeop_entries[] = {
398	{ &vnop_default_desc, (vnop_t *)vn_default_error },
399	{ &vnop_lookup_desc, (vnop_t *)fifo_lookup },		/* lookup */
400	{ &vnop_create_desc, (vnop_t *)fifo_create },		/* create */
401	{ &vnop_mknod_desc, (vnop_t *)fifo_mknod },		/* mknod */
402	{ &vnop_open_desc, (vnop_t *)fifo_open },		/* open */
403	{ &vnop_close_desc, (vnop_t *)nfsfifo_vnop_close },	/* close */
404	{ &vnop_getattr_desc, (vnop_t *)nfs4_vnop_getattr },	/* getattr */
405	{ &vnop_setattr_desc, (vnop_t *)nfs_vnop_setattr },	/* setattr */
406	{ &vnop_read_desc, (vnop_t *)nfsfifo_vnop_read },	/* read */
407	{ &vnop_write_desc, (vnop_t *)nfsfifo_vnop_write },	/* write */
408	{ &vnop_ioctl_desc, (vnop_t *)fifo_ioctl },		/* ioctl */
409	{ &vnop_select_desc, (vnop_t *)fifo_select },		/* select */
410	{ &vnop_revoke_desc, (vnop_t *)fifo_revoke },		/* revoke */
411	{ &vnop_mmap_desc, (vnop_t *)fifo_mmap },		/* mmap */
412	{ &vnop_fsync_desc, (vnop_t *)nfs_vnop_fsync },		/* fsync */
413	{ &vnop_remove_desc, (vnop_t *)fifo_remove },		/* remove */
414	{ &vnop_link_desc, (vnop_t *)fifo_link },		/* link */
415	{ &vnop_rename_desc, (vnop_t *)fifo_rename },		/* rename */
416	{ &vnop_mkdir_desc, (vnop_t *)fifo_mkdir },		/* mkdir */
417	{ &vnop_rmdir_desc, (vnop_t *)fifo_rmdir },		/* rmdir */
418	{ &vnop_symlink_desc, (vnop_t *)fifo_symlink },		/* symlink */
419	{ &vnop_readdir_desc, (vnop_t *)fifo_readdir },		/* readdir */
420	{ &vnop_readlink_desc, (vnop_t *)fifo_readlink },	/* readlink */
421	{ &vnop_inactive_desc, (vnop_t *)nfs_vnop_inactive },	/* inactive */
422	{ &vnop_reclaim_desc, (vnop_t *)nfs_vnop_reclaim },	/* reclaim */
423	{ &vnop_strategy_desc, (vnop_t *)fifo_strategy },	/* strategy */
424	{ &vnop_pathconf_desc, (vnop_t *)fifo_pathconf },	/* pathconf */
425	{ &vnop_advlock_desc, (vnop_t *)fifo_advlock },		/* advlock */
426	{ &vnop_bwrite_desc, (vnop_t *)vn_bwrite },		/* bwrite */
427	{ &vnop_pagein_desc, (vnop_t *)nfs_vnop_pagein },	/* Pagein */
428	{ &vnop_pageout_desc, (vnop_t *)nfs_vnop_pageout },	/* Pageout */
429	{ &vnop_blktooff_desc, (vnop_t *)nfs_vnop_blktooff },	/* blktooff */
430	{ &vnop_offtoblk_desc, (vnop_t *)nfs_vnop_offtoblk },	/* offtoblk */
431	{ &vnop_blockmap_desc, (vnop_t *)nfs_vnop_blockmap },	/* blockmap */
432	{ &vnop_getxattr_desc, (vnop_t *)nfs4_vnop_getxattr },	/* getxattr */
433	{ &vnop_setxattr_desc, (vnop_t *)nfs4_vnop_setxattr },	/* setxattr */
434	{ &vnop_removexattr_desc, (vnop_t *)nfs4_vnop_removexattr },/* removexattr */
435	{ &vnop_listxattr_desc, (vnop_t *)nfs4_vnop_listxattr },/* listxattr */
436#if NAMEDSTREAMS
437	{ &vnop_getnamedstream_desc, (vnop_t *)nfs4_vnop_getnamedstream },	/* getnamedstream */
438	{ &vnop_makenamedstream_desc, (vnop_t *)nfs4_vnop_makenamedstream },	/* makenamedstream */
439	{ &vnop_removenamedstream_desc, (vnop_t *)nfs4_vnop_removenamedstream },/* removenamedstream */
440#endif
441	{ &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor },	/* monitor */
442	{ NULL, NULL }
443};
444struct vnodeopv_desc fifo_nfsv4nodeop_opv_desc =
445	{ &fifo_nfsv4nodeop_p, fifo_nfsv4nodeop_entries };
446#endif /* FIFO */
447
448
449int	nfs_sillyrename(nfsnode_t,nfsnode_t,struct componentname *,vfs_context_t);
450
451/*
452 * Find the slot in the access cache for this UID.
453 * If adding and no existing slot is found, reuse slots in FIFO order.
454 * The index of the next slot to use is kept in the last entry of the n_access array.
455 */
456int
457nfs_node_access_slot(nfsnode_t np, uid_t uid, int add)
458{
459	int slot;
460
461	for (slot=0; slot < NFS_ACCESS_CACHE_SIZE; slot++)
462		if (np->n_accessuid[slot] == uid)
463			break;
464	if (slot == NFS_ACCESS_CACHE_SIZE) {
465		if (!add)
466			return (-1);
467		slot = np->n_access[NFS_ACCESS_CACHE_SIZE];
468		np->n_access[NFS_ACCESS_CACHE_SIZE] = (slot + 1) % NFS_ACCESS_CACHE_SIZE;
469	}
470	return (slot);
471}
472
473int
474nfs3_access_rpc(nfsnode_t np, u_int32_t *access, vfs_context_t ctx)
475{
476	int error = 0, lockerror = ENOENT, status, slot;
477	uint32_t access_result = 0;
478	u_int64_t xid;
479	struct nfsm_chain nmreq, nmrep;
480	struct timeval now;
481	uid_t uid;
482
483	nfsm_chain_null(&nmreq);
484	nfsm_chain_null(&nmrep);
485
486	nfsm_chain_build_alloc_init(error, &nmreq, NFSX_FH(NFS_VER3) + NFSX_UNSIGNED);
487	nfsm_chain_add_fh(error, &nmreq, NFS_VER3, np->n_fhp, np->n_fhsize);
488	nfsm_chain_add_32(error, &nmreq, *access);
489	nfsm_chain_build_done(error, &nmreq);
490	nfsmout_if(error);
491	error = nfs_request(np, NULL, &nmreq, NFSPROC_ACCESS, ctx, NULL, &nmrep, &xid, &status);
492	if ((lockerror = nfs_node_lock(np)))
493		error = lockerror;
494	nfsm_chain_postop_attr_update(error, &nmrep, np, &xid);
495	if (!error)
496		error = status;
497	nfsm_chain_get_32(error, &nmrep, access_result);
498	nfsmout_if(error);
499
500	uid = kauth_cred_getuid(vfs_context_ucred(ctx));
501	slot = nfs_node_access_slot(np, uid, 1);
502	np->n_accessuid[slot] = uid;
503	microuptime(&now);
504	np->n_accessstamp[slot] = now.tv_sec;
505	np->n_access[slot] = access_result;
506
507	/*
508	 * If we asked for DELETE but didn't get it, the server
509	 * may simply not support returning that bit (possible
510	 * on UNIX systems).  So, we'll assume that it is OK,
511	 * and just let any subsequent delete action fail if it
512	 * really isn't deletable.
513	 */
514	if ((*access & NFS_ACCESS_DELETE) &&
515	    !(np->n_access[slot] & NFS_ACCESS_DELETE))
516		np->n_access[slot] |= NFS_ACCESS_DELETE;
517	/* ".zfs" subdirectories may erroneously give a denied answer for add/remove */
518	if (nfs_access_dotzfs && (np->n_flag & NISDOTZFSCHILD))
519		np->n_access[slot] |= (NFS_ACCESS_MODIFY|NFS_ACCESS_EXTEND|NFS_ACCESS_DELETE);
520	/* pass back the access returned with this request */
521	*access = np->n_access[slot];
522nfsmout:
523	if (!lockerror)
524		nfs_node_unlock(np);
525	nfsm_chain_cleanup(&nmreq);
526	nfsm_chain_cleanup(&nmrep);
527	return (error);
528}
529
530/*
531 * See if our mount is in trouble. Note this is inherently racey.
532 */
533static int
534nfs_notresponding(struct nfsmount *nmp)
535{
536	int timeoutmask = NFSSTA_TIMEO | NFSSTA_LOCKTIMEO | NFSSTA_JUKEBOXTIMEO;
537	if (NMFLAG(nmp, MUTEJUKEBOX)) /* jukebox timeouts don't count as unresponsive if muted */
538		   timeoutmask &= ~NFSSTA_JUKEBOXTIMEO;
539
540	return ((nmp->nm_state & timeoutmask) || !(nmp->nm_sockflags & NMSOCK_READY));
541}
542
543/*
544 * NFS access vnode op.
545 * For NFS version 2, just return ok. File accesses may fail later.
546 * For NFS version 3+, use the access RPC to check accessibility. If file
547 * permissions are changed on the server, accesses might still fail later.
548 */
549int
550nfs_vnop_access(
551	struct vnop_access_args /* {
552		struct vnodeop_desc *a_desc;
553		vnode_t a_vp;
554		int a_action;
555		vfs_context_t a_context;
556	} */ *ap)
557{
558	vfs_context_t ctx = ap->a_context;
559	vnode_t vp = ap->a_vp;
560	int error = 0, slot, dorpc;
561	u_int32_t access, waccess;
562	nfsnode_t np = VTONFS(vp);
563	struct nfsmount *nmp;
564	int nfsvers;
565	struct timeval now;
566	uid_t uid;
567
568	nmp = VTONMP(vp);
569	if (!nmp)
570		return (ENXIO);
571	nfsvers = nmp->nm_vers;
572
573	if (nfsvers == NFS_VER2) {
574		if ((ap->a_action & KAUTH_VNODE_WRITE_RIGHTS) &&
575		    vfs_isrdonly(vnode_mount(vp)))
576			return (EROFS);
577		return (0);
578	}
579
580	/*
581	 * For NFS v3, do an access rpc, otherwise you are stuck emulating
582	 * ufs_access() locally using the vattr. This may not be correct,
583	 * since the server may apply other access criteria such as
584	 * client uid-->server uid mapping that we do not know about, but
585	 * this is better than just returning anything that is lying about
586	 * in the cache.
587	 */
588
589	/*
590	 * Convert KAUTH primitives to NFS access rights.
591	 */
592	access = 0;
593	if (vnode_isdir(vp)) {
594		/* directory */
595		if (ap->a_action &
596		    (KAUTH_VNODE_LIST_DIRECTORY |
597		    KAUTH_VNODE_READ_EXTATTRIBUTES))
598			access |= NFS_ACCESS_READ;
599		if (ap->a_action & KAUTH_VNODE_SEARCH)
600			access |= NFS_ACCESS_LOOKUP;
601		if (ap->a_action &
602		    (KAUTH_VNODE_ADD_FILE |
603		    KAUTH_VNODE_ADD_SUBDIRECTORY))
604			access |= NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND;
605		if (ap->a_action & KAUTH_VNODE_DELETE_CHILD)
606			access |= NFS_ACCESS_MODIFY;
607	} else {
608		/* file */
609		if (ap->a_action &
610		    (KAUTH_VNODE_READ_DATA |
611		    KAUTH_VNODE_READ_EXTATTRIBUTES))
612			access |= NFS_ACCESS_READ;
613		if (ap->a_action & KAUTH_VNODE_WRITE_DATA)
614			access |= NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND;
615		if (ap->a_action & KAUTH_VNODE_APPEND_DATA)
616			access |= NFS_ACCESS_EXTEND;
617		if (ap->a_action & KAUTH_VNODE_EXECUTE)
618			access |= NFS_ACCESS_EXECUTE;
619	}
620	/* common */
621	if (ap->a_action & KAUTH_VNODE_DELETE)
622		access |= NFS_ACCESS_DELETE;
623	if (ap->a_action &
624	    (KAUTH_VNODE_WRITE_ATTRIBUTES |
625	    KAUTH_VNODE_WRITE_EXTATTRIBUTES |
626	    KAUTH_VNODE_WRITE_SECURITY))
627		access |= NFS_ACCESS_MODIFY;
628	/* XXX this is pretty dubious */
629	if (ap->a_action & KAUTH_VNODE_CHANGE_OWNER)
630		access |= NFS_ACCESS_MODIFY;
631
632	/* if caching, always ask for every right */
633	if (nfs_access_cache_timeout > 0) {
634		waccess = NFS_ACCESS_READ | NFS_ACCESS_MODIFY |
635			NFS_ACCESS_EXTEND | NFS_ACCESS_EXECUTE |
636			NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP;
637	} else {
638		waccess = access;
639	}
640
641	if ((error = nfs_node_lock(np)))
642		return (error);
643
644	/*
645	 * Does our cached result allow us to give a definite yes to
646	 * this request?
647	 */
648	uid = kauth_cred_getuid(vfs_context_ucred(ctx));
649	slot = nfs_node_access_slot(np, uid, 0);
650	dorpc = 1;
651	if (access == 0) {
652		/* not asking for any rights understood by NFS, so don't bother doing an RPC */
653		/* OSAddAtomic(1, &nfsstats.accesscache_hits); */
654		dorpc = 0;
655		waccess = 0;
656	} else if (NACCESSVALID(np, slot)) {
657		/*
658		 * In addition if the kernel is checking for access, i.e.,
659		 * KAUTH_VNODE_ACCESS is not set, and the server does not seem
660		 * to be responding just return if we have something in the
661		 * cache even if its stale for the user. If were granted access
662		 * by the cache and we're a kernel access, then call it good
663		 * enough. We want to avoid having this particular request going
664		 * over the wire causing a hang. This is because at this moment
665		 * we do not know what the state of the server is and what ever
666		 * we get back be it either yea or nay is going to be stale.
667		 * Finder (Desktop services/FileURL) might hang when going over
668		 * the wire when just asking getattrlist for the roots FSID
669		 * since we are going to be called to see if we're authorized
670		 * for search.
671		 *
672		 * N.B. This is also the strategy that SMB is using.
673		 */
674		int granted = ((np->n_access[slot] & access) == access);
675
676		if (!(ap->a_action & KAUTH_VNODE_ACCESS)) {
677			if (granted || nfs_notresponding(nmp)) {
678				dorpc = 0;
679				waccess = np->n_access[slot];
680			}
681		} else {
682			int stale;
683			microuptime(&now);
684			stale = (now.tv_sec >= (np->n_accessstamp[slot] + nfs_access_cache_timeout));
685			if (granted && !stale) {
686			/* OSAddAtomic(1, &nfsstats.accesscache_hits); */
687				dorpc = 0;
688				waccess = np->n_access[slot];
689			}
690		}
691	}
692	nfs_node_unlock(np);
693	if (dorpc) {
694		/* Either a no, or a don't know.  Go to the wire. */
695		/* OSAddAtomic(1, &nfsstats.accesscache_misses); */
696		error = nmp->nm_funcs->nf_access_rpc(np, &waccess, ctx);
697	}
698	if (!error && ((waccess & access) != access))
699		error = EACCES;
700
701	return (error);
702}
703
704
705/*
706 * NFS open vnode op
707 *
708 * Perform various update/invalidation checks and then add the
709 * open to the node.  Regular files will have an open file structure
710 * on the node and, for NFSv4, perform an OPEN request on the server.
711 */
712int
713nfs_vnop_open(
714	struct vnop_open_args /* {
715		struct vnodeop_desc *a_desc;
716		vnode_t a_vp;
717		int a_mode;
718		vfs_context_t a_context;
719	} */ *ap)
720{
721	vfs_context_t ctx = ap->a_context;
722	vnode_t vp = ap->a_vp;
723	nfsnode_t np = VTONFS(vp);
724	struct nfsmount *nmp;
725	int error, accessMode, denyMode, opened = 0;
726	struct nfs_open_owner *noop = NULL;
727	struct nfs_open_file *nofp = NULL;
728	enum vtype vtype;
729
730	if (!(ap->a_mode & (FREAD|FWRITE)))
731		return (EINVAL);
732
733	nmp = VTONMP(vp);
734	if (!nmp)
735		return (ENXIO);
736	if (np->n_flag & NREVOKE)
737		return (EIO);
738
739	vtype = vnode_vtype(vp);
740	if ((vtype != VREG) && (vtype != VDIR) && (vtype != VLNK))
741		return (EACCES);
742
743	/* First, check if we need to update/invalidate */
744	if (ISSET(np->n_flag, NUPDATESIZE))
745		nfs_data_update_size(np, 0);
746	if ((error = nfs_node_lock(np)))
747		return (error);
748	if (np->n_flag & NNEEDINVALIDATE) {
749		np->n_flag &= ~NNEEDINVALIDATE;
750		if (vtype == VDIR)
751			nfs_invaldir(np);
752		nfs_node_unlock(np);
753		nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1);
754		if ((error = nfs_node_lock(np)))
755			return (error);
756	}
757	if (vtype == VREG)
758		np->n_lastrahead = -1;
759	if (np->n_flag & NMODIFIED) {
760		if (vtype == VDIR)
761			nfs_invaldir(np);
762		nfs_node_unlock(np);
763		if ((error = nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1)))
764			return (error);
765	} else {
766		nfs_node_unlock(np);
767	}
768
769	/* nfs_getattr() will check changed and purge caches */
770	if ((error = nfs_getattr(np, NULL, ctx, NGA_UNCACHED)))
771		return (error);
772
773	if (vtype != VREG) {
774		/* Just mark that it was opened */
775		lck_mtx_lock(&np->n_openlock);
776		np->n_openrefcnt++;
777		lck_mtx_unlock(&np->n_openlock);
778		return (0);
779	}
780
781	/* mode contains some combination of: FREAD, FWRITE, O_SHLOCK, O_EXLOCK */
782	accessMode = 0;
783	if (ap->a_mode & FREAD)
784		accessMode |= NFS_OPEN_SHARE_ACCESS_READ;
785	if (ap->a_mode & FWRITE)
786		accessMode |= NFS_OPEN_SHARE_ACCESS_WRITE;
787	if (ap->a_mode & O_EXLOCK)
788		denyMode = NFS_OPEN_SHARE_DENY_BOTH;
789	else if (ap->a_mode & O_SHLOCK)
790		denyMode = NFS_OPEN_SHARE_DENY_WRITE;
791	else
792		denyMode = NFS_OPEN_SHARE_DENY_NONE;
793	// XXX don't do deny modes just yet (and never do it for !v4)
794	denyMode = NFS_OPEN_SHARE_DENY_NONE;
795
796	noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 1);
797	if (!noop)
798		return (ENOMEM);
799
800restart:
801	error = nfs_mount_state_in_use_start(nmp, vfs_context_thread(ctx));
802	if (error) {
803		nfs_open_owner_rele(noop);
804		return (error);
805	}
806	if (np->n_flag & NREVOKE) {
807		error = EIO;
808		nfs_mount_state_in_use_end(nmp, 0);
809		nfs_open_owner_rele(noop);
810		return (error);
811	}
812
813	error = nfs_open_file_find(np, noop, &nofp, accessMode, denyMode, 1);
814	if (!error && (nofp->nof_flags & NFS_OPEN_FILE_LOST)) {
815		NP(np, "nfs_vnop_open: LOST %d", kauth_cred_getuid(nofp->nof_owner->noo_cred));
816		error = EIO;
817	}
818	if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) {
819		nfs_mount_state_in_use_end(nmp, 0);
820		error = nfs4_reopen(nofp, vfs_context_thread(ctx));
821		nofp = NULL;
822		if (!error)
823			goto restart;
824	}
825	if (!error)
826		error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx));
827	if (error) {
828		nofp = NULL;
829		goto out;
830	}
831
832	if (nmp->nm_vers < NFS_VER4) {
833		/*
834		 * NFS v2/v3 opens are always allowed - so just add it.
835		 */
836		nfs_open_file_add_open(nofp, accessMode, denyMode, 0);
837		goto out;
838	}
839
840	/*
841	 * If we just created the file and the modes match, then we simply use
842	 * the open performed in the create.  Otherwise, send the request.
843	 */
844	if ((nofp->nof_flags & NFS_OPEN_FILE_CREATE) &&
845	    (nofp->nof_creator == current_thread()) &&
846	    (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) &&
847	    (denyMode == NFS_OPEN_SHARE_DENY_NONE)) {
848		nofp->nof_flags &= ~NFS_OPEN_FILE_CREATE;
849		nofp->nof_creator = NULL;
850	} else {
851		if (!opened)
852			error = nfs4_open(np, nofp, accessMode, denyMode, ctx);
853		if ((error == EACCES) && (nofp->nof_flags & NFS_OPEN_FILE_CREATE) &&
854		    (nofp->nof_creator == current_thread())) {
855			/*
856			 * Ugh.  This can happen if we just created the file with read-only
857			 * perms and we're trying to open it for real with different modes
858			 * (e.g. write-only or with a deny mode) and the server decides to
859			 * not allow the second open because of the read-only perms.
860			 * The best we can do is to just use the create's open.
861			 * We may have access we don't need or we may not have a requested
862			 * deny mode.  We may log complaints later, but we'll try to avoid it.
863			 */
864			if (denyMode != NFS_OPEN_SHARE_DENY_NONE)
865				NP(np, "nfs_vnop_open: deny mode foregone on create, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred));
866			nofp->nof_creator = NULL;
867			error = 0;
868		}
869		if (error)
870			goto out;
871		opened = 1;
872		/*
873		 * If we had just created the file, we already had it open.
874		 * If the actual open mode is less than what we grabbed at
875		 * create time, then we'll downgrade the open here.
876		 */
877		if ((nofp->nof_flags & NFS_OPEN_FILE_CREATE) &&
878		    (nofp->nof_creator == current_thread())) {
879			error = nfs_close(np, nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_NONE, ctx);
880			if (error)
881				NP(np, "nfs_vnop_open: create close error %d, %d", error, kauth_cred_getuid(nofp->nof_owner->noo_cred));
882			if (!nfs_mount_state_error_should_restart(error)) {
883				error = 0;
884				nofp->nof_flags &= ~NFS_OPEN_FILE_CREATE;
885			}
886		}
887	}
888
889out:
890	if (nofp)
891		nfs_open_file_clear_busy(nofp);
892	if (nfs_mount_state_in_use_end(nmp, error)) {
893		nofp = NULL;
894		goto restart;
895	}
896	if (error)
897		NP(np, "nfs_vnop_open: error %d, %d", error, kauth_cred_getuid(noop->noo_cred));
898	if (noop)
899		nfs_open_owner_rele(noop);
900	if (!error && vtype == VREG && (ap->a_mode & FWRITE)) {
901		lck_mtx_lock(&nmp->nm_lock);
902		nmp->nm_state &= ~NFSSTA_SQUISHY;
903		nmp->nm_curdeadtimeout = nmp->nm_deadtimeout;
904		if (nmp->nm_curdeadtimeout <= 0)
905			nmp->nm_deadto_start = 0;
906		nmp->nm_writers++;
907		lck_mtx_unlock(&nmp->nm_lock);
908	}
909
910	return (error);
911}
912
913static uint32_t
914nfs_no_of_open_file_writers(nfsnode_t np)
915{
916	uint32_t writers = 0;
917	struct nfs_open_file *nofp;
918
919	TAILQ_FOREACH(nofp,  &np->n_opens, nof_link) {
920		writers += nofp->nof_w + nofp->nof_rw + nofp->nof_w_dw + nofp->nof_rw_dw +
921			nofp->nof_w_drw + nofp->nof_rw_drw + nofp->nof_d_w_dw +
922			nofp->nof_d_rw_dw + nofp->nof_d_w_drw + nofp->nof_d_rw_drw +
923			nofp->nof_d_w + nofp->nof_d_rw;
924	}
925
926	return (writers);
927}
928
929/*
930 * NFS close vnode op
931 *
932 * What an NFS client should do upon close after writing is a debatable issue.
933 * Most NFS clients push delayed writes to the server upon close, basically for
934 * two reasons:
935 * 1 - So that any write errors may be reported back to the client process
936 *     doing the close system call. By far the two most likely errors are
937 *     NFSERR_NOSPC and NFSERR_DQUOT to indicate space allocation failure.
938 * 2 - To put a worst case upper bound on cache inconsistency between
939 *     multiple clients for the file.
940 * There is also a consistency problem for Version 2 of the protocol w.r.t.
941 * not being able to tell if other clients are writing a file concurrently,
942 * since there is no way of knowing if the changed modify time in the reply
943 * is only due to the write for this client.
944 * (NFS Version 3 provides weak cache consistency data in the reply that
945 *  should be sufficient to detect and handle this case.)
946 *
947 * The current code does the following:
948 * for NFS Version 2 - play it safe and flush/invalidate all dirty buffers
949 * for NFS Version 3 - flush dirty buffers to the server but don't invalidate them.
950 * for NFS Version 4 - basically the same as NFSv3
951 */
952int
953nfs_vnop_close(
954	struct vnop_close_args /* {
955		struct vnodeop_desc *a_desc;
956		vnode_t a_vp;
957		int a_fflag;
958		vfs_context_t a_context;
959	} */ *ap)
960{
961	vfs_context_t ctx = ap->a_context;
962	vnode_t vp = ap->a_vp;
963	nfsnode_t np = VTONFS(vp);
964	struct nfsmount *nmp;
965	int error = 0, error1, nfsvers;
966	int fflag = ap->a_fflag;
967	enum vtype vtype;
968	int accessMode, denyMode;
969	struct nfs_open_owner *noop = NULL;
970	struct nfs_open_file *nofp = NULL;
971
972	nmp = VTONMP(vp);
973	if (!nmp)
974		return (ENXIO);
975	nfsvers = nmp->nm_vers;
976	vtype = vnode_vtype(vp);
977
978	/* First, check if we need to update/flush/invalidate */
979	if (ISSET(np->n_flag, NUPDATESIZE))
980		nfs_data_update_size(np, 0);
981	nfs_node_lock_force(np);
982	if (np->n_flag & NNEEDINVALIDATE) {
983		np->n_flag &= ~NNEEDINVALIDATE;
984		nfs_node_unlock(np);
985		nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1);
986		nfs_node_lock_force(np);
987	}
988	if ((vtype == VREG) && (np->n_flag & NMODIFIED) && (fflag & FWRITE)) {
989		/* we're closing an open for write and the file is modified, so flush it */
990		nfs_node_unlock(np);
991		if (nfsvers != NFS_VER2)
992			error = nfs_flush(np, MNT_WAIT, vfs_context_thread(ctx), 0);
993		else
994			error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1);
995		nfs_node_lock_force(np);
996		NATTRINVALIDATE(np);
997	}
998	if (np->n_flag & NWRITEERR) {
999		np->n_flag &= ~NWRITEERR;
1000		error = np->n_error;
1001	}
1002	nfs_node_unlock(np);
1003
1004	if (vtype != VREG) {
1005		/* Just mark that it was closed */
1006		lck_mtx_lock(&np->n_openlock);
1007		if (np->n_openrefcnt == 0) {
1008			if (fflag & (FREAD|FWRITE)) {
1009				NP(np, "nfs_vnop_close: open reference underrun");
1010				error = EINVAL;
1011			}
1012		} else if (fflag & (FREAD|FWRITE)) {
1013			np->n_openrefcnt--;
1014		} else {
1015			/* No FREAD/FWRITE set - probably the final close */
1016			np->n_openrefcnt = 0;
1017		}
1018		lck_mtx_unlock(&np->n_openlock);
1019		return (error);
1020	}
1021	error1 = error;
1022
1023	/* fflag should contain some combination of: FREAD, FWRITE, FHASLOCK */
1024	accessMode = 0;
1025	if (fflag & FREAD)
1026		accessMode |= NFS_OPEN_SHARE_ACCESS_READ;
1027	if (fflag & FWRITE)
1028		accessMode |= NFS_OPEN_SHARE_ACCESS_WRITE;
1029// XXX It would be nice if we still had the O_EXLOCK/O_SHLOCK flags that were on the open
1030//	if (fflag & O_EXLOCK)
1031//		denyMode = NFS_OPEN_SHARE_DENY_BOTH;
1032//	else if (fflag & O_SHLOCK)
1033//		denyMode = NFS_OPEN_SHARE_DENY_WRITE;
1034//	else
1035//		denyMode = NFS_OPEN_SHARE_DENY_NONE;
1036#if 0  // Not yet
1037	if (fflag & FHASLOCK) {
1038		/* XXX assume FHASLOCK is for the deny mode and not flock */
1039		/* FHASLOCK flock will be unlocked in the close path, but the flag is not cleared. */
1040		if (nofp->nof_deny & NFS_OPEN_SHARE_DENY_READ)
1041			denyMode = NFS_OPEN_SHARE_DENY_BOTH;
1042		else if (nofp->nof_deny & NFS_OPEN_SHARE_DENY_WRITE)
1043			denyMode = NFS_OPEN_SHARE_DENY_WRITE;
1044		else
1045			denyMode = NFS_OPEN_SHARE_DENY_NONE;
1046	} else {
1047			denyMode = NFS_OPEN_SHARE_DENY_NONE;
1048	}
1049#else
1050	// XXX don't do deny modes just yet (and never do it for !v4)
1051	denyMode = NFS_OPEN_SHARE_DENY_NONE;
1052#endif
1053
1054	if (!accessMode) {
1055		/*
1056		 * No mode given to close?
1057		 * Guess this is the final close.
1058		 * We should unlock all locks and close all opens.
1059		 */
1060		uint32_t writers;
1061		mount_t mp = vnode_mount(vp);
1062		int force = (!mp || (mp->mnt_kern_flag & MNTK_FRCUNMOUNT));
1063
1064		writers = nfs_no_of_open_file_writers(np);
1065		nfs_release_open_state_for_node(np, force);
1066		if (writers) {
1067			lck_mtx_lock(&nmp->nm_lock);
1068			if (writers > nmp->nm_writers) {
1069				NP(np, "nfs_vnop_close: number of write opens for mount underrun. Node has %d"
1070				   " opens for write. Mount has total of %d opens for write\n",
1071				   writers, nmp->nm_writers);
1072				nmp->nm_writers = 0;
1073			} else {
1074				nmp->nm_writers -= writers;
1075			}
1076			lck_mtx_unlock(&nmp->nm_lock);
1077		}
1078
1079		return (error);
1080	} else if (fflag & FWRITE) {
1081		lck_mtx_lock(&nmp->nm_lock);
1082		if (nmp->nm_writers == 0) {
1083			NP(np, "nfs_vnop_close: removing open writer from mount, but mount has no files open for writing");
1084		} else {
1085			nmp->nm_writers--;
1086		}
1087		lck_mtx_unlock(&nmp->nm_lock);
1088	}
1089
1090
1091	noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 0);
1092	if (!noop) {
1093		// printf("nfs_vnop_close: can't get open owner!\n");
1094		return (EIO);
1095	}
1096
1097restart:
1098	error = nfs_mount_state_in_use_start(nmp, NULL);
1099	if (error) {
1100		nfs_open_owner_rele(noop);
1101		return (error);
1102	}
1103
1104	error = nfs_open_file_find(np, noop, &nofp, 0, 0, 0);
1105	if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) {
1106		nfs_mount_state_in_use_end(nmp, 0);
1107		error = nfs4_reopen(nofp, NULL);
1108		nofp = NULL;
1109		if (!error)
1110			goto restart;
1111	}
1112	if (error) {
1113		NP(np, "nfs_vnop_close: no open file for owner, error %d, %d", error, kauth_cred_getuid(noop->noo_cred));
1114		error = EBADF;
1115		goto out;
1116	}
1117	error = nfs_open_file_set_busy(nofp, NULL);
1118	if (error) {
1119		nofp = NULL;
1120		goto out;
1121	}
1122
1123	error = nfs_close(np, nofp, accessMode, denyMode, ctx);
1124	if (error)
1125		NP(np, "nfs_vnop_close: close error %d, %d", error, kauth_cred_getuid(noop->noo_cred));
1126
1127out:
1128	if (nofp)
1129		nfs_open_file_clear_busy(nofp);
1130	if (nfs_mount_state_in_use_end(nmp, error)) {
1131		nofp = NULL;
1132		goto restart;
1133	}
1134	if (!error)
1135		error = error1;
1136	if (error)
1137		NP(np, "nfs_vnop_close: error %d, %d", error, kauth_cred_getuid(noop->noo_cred));
1138	if (noop)
1139		nfs_open_owner_rele(noop);
1140	return (error);
1141}
1142
1143/*
1144 * nfs_close(): common function that does all the heavy lifting of file closure
1145 *
1146 * Takes an open file structure and a set of access/deny modes and figures out how
1147 * to update the open file structure (and the state on the server) appropriately.
1148 */
1149int
1150nfs_close(
1151	nfsnode_t np,
1152	struct nfs_open_file *nofp,
1153	uint32_t accessMode,
1154	uint32_t denyMode,
1155	vfs_context_t ctx)
1156{
1157	struct nfs_lock_owner *nlop;
1158	int error = 0, changed = 0, delegated = 0, closed = 0, downgrade = 0;
1159	uint32_t newAccessMode, newDenyMode;
1160
1161	/* warn if modes don't match current state */
1162	if (((accessMode & nofp->nof_access) != accessMode) || ((denyMode & nofp->nof_deny) != denyMode))
1163		NP(np, "nfs_close: mode mismatch %d %d, current %d %d, %d",
1164			accessMode, denyMode, nofp->nof_access, nofp->nof_deny,
1165			kauth_cred_getuid(nofp->nof_owner->noo_cred));
1166
1167	/*
1168	 * If we're closing a write-only open, we may not have a write-only count
1169	 * if we also grabbed read access.  So, check the read-write count.
1170	 */
1171	if (denyMode == NFS_OPEN_SHARE_DENY_NONE) {
1172		if ((accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) &&
1173		    (nofp->nof_w == 0) && (nofp->nof_d_w == 0) &&
1174		    (nofp->nof_rw || nofp->nof_d_rw))
1175			accessMode = NFS_OPEN_SHARE_ACCESS_BOTH;
1176	} else if (denyMode == NFS_OPEN_SHARE_DENY_WRITE) {
1177		if ((accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) &&
1178		    (nofp->nof_w_dw == 0) && (nofp->nof_d_w_dw == 0) &&
1179		    (nofp->nof_rw_dw || nofp->nof_d_rw_dw))
1180			accessMode = NFS_OPEN_SHARE_ACCESS_BOTH;
1181	} else { /* NFS_OPEN_SHARE_DENY_BOTH */
1182		if ((accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) &&
1183		    (nofp->nof_w_drw == 0) && (nofp->nof_d_w_drw == 0) &&
1184		    (nofp->nof_rw_drw || nofp->nof_d_rw_drw))
1185			accessMode = NFS_OPEN_SHARE_ACCESS_BOTH;
1186	}
1187
1188	nfs_open_file_remove_open_find(nofp, accessMode, denyMode, &newAccessMode, &newDenyMode, &delegated);
1189	if ((newAccessMode != nofp->nof_access) || (newDenyMode != nofp->nof_deny))
1190		changed = 1;
1191	else
1192		changed = 0;
1193
1194	if (NFSTONMP(np)->nm_vers < NFS_VER4) /* NFS v2/v3 closes simply need to remove the open. */
1195		goto v3close;
1196
1197	if ((newAccessMode == 0) || (nofp->nof_opencnt == 1)) {
1198		/*
1199		 * No more access after this close, so clean up and close it.
1200		 * Don't send a close RPC if we're closing a delegated open.
1201		 */
1202		nfs_wait_bufs(np);
1203		closed = 1;
1204		if (!delegated && !(nofp->nof_flags & NFS_OPEN_FILE_LOST))
1205			error = nfs4_close_rpc(np, nofp, vfs_context_thread(ctx), vfs_context_ucred(ctx), 0);
1206		if (error == NFSERR_LOCKS_HELD) {
1207			/*
1208			 * Hmm... the server says we have locks we need to release first
1209			 * Find the lock owner and try to unlock everything.
1210			 */
1211			nlop = nfs_lock_owner_find(np, vfs_context_proc(ctx), 0);
1212			if (nlop) {
1213				nfs4_unlock_rpc(np, nlop, F_WRLCK, 0, UINT64_MAX,
1214					0, vfs_context_thread(ctx), vfs_context_ucred(ctx));
1215				nfs_lock_owner_rele(nlop);
1216			}
1217			error = nfs4_close_rpc(np, nofp, vfs_context_thread(ctx), vfs_context_ucred(ctx), 0);
1218		}
1219	} else if (changed) {
1220		/*
1221		 * File is still open but with less access, so downgrade the open.
1222		 * Don't send a downgrade RPC if we're closing a delegated open.
1223		 */
1224		if (!delegated && !(nofp->nof_flags & NFS_OPEN_FILE_LOST)) {
1225			downgrade = 1;
1226			/*
1227			 * If we have delegated opens, we should probably claim them before sending
1228			 * the downgrade because the server may not know the open we are downgrading to.
1229			 */
1230			if (nofp->nof_d_rw_drw || nofp->nof_d_w_drw || nofp->nof_d_r_drw ||
1231			    nofp->nof_d_rw_dw || nofp->nof_d_w_dw || nofp->nof_d_r_dw ||
1232			    nofp->nof_d_rw || nofp->nof_d_w || nofp->nof_d_r)
1233				nfs4_claim_delegated_state_for_open_file(nofp, 0);
1234			/* need to remove the open before sending the downgrade */
1235			nfs_open_file_remove_open(nofp, accessMode, denyMode);
1236			error = nfs4_open_downgrade_rpc(np, nofp, ctx);
1237			if (error) /* Hmm.. that didn't work. Add the open back in. */
1238				nfs_open_file_add_open(nofp, accessMode, denyMode, delegated);
1239		}
1240	}
1241
1242	if (error) {
1243		NP(np, "nfs_close: error %d, %d", error, kauth_cred_getuid(nofp->nof_owner->noo_cred));
1244		return (error);
1245	}
1246
1247v3close:
1248	if (!downgrade)
1249		nfs_open_file_remove_open(nofp, accessMode, denyMode);
1250
1251	if (closed) {
1252		lck_mtx_lock(&nofp->nof_lock);
1253		if (nofp->nof_r || nofp->nof_d_r || nofp->nof_w || nofp->nof_d_w || nofp->nof_d_rw ||
1254		    (nofp->nof_rw && !((nofp->nof_flags & NFS_OPEN_FILE_CREATE) && !nofp->nof_creator && (nofp->nof_rw == 1))) ||
1255		    nofp->nof_r_dw || nofp->nof_d_r_dw || nofp->nof_w_dw || nofp->nof_d_w_dw ||
1256		    nofp->nof_rw_dw || nofp->nof_d_rw_dw || nofp->nof_r_drw || nofp->nof_d_r_drw ||
1257		    nofp->nof_w_drw || nofp->nof_d_w_drw || nofp->nof_rw_drw || nofp->nof_d_rw_drw)
1258			NP(np, "nfs_close: unexpected count: %u.%u %u.%u %u.%u dw %u.%u %u.%u %u.%u drw %u.%u %u.%u %u.%u flags 0x%x, %d",
1259				nofp->nof_r, nofp->nof_d_r, nofp->nof_w, nofp->nof_d_w,
1260				nofp->nof_rw, nofp->nof_d_rw, nofp->nof_r_dw, nofp->nof_d_r_dw,
1261				nofp->nof_w_dw, nofp->nof_d_w_dw, nofp->nof_rw_dw, nofp->nof_d_rw_dw,
1262				nofp->nof_r_drw, nofp->nof_d_r_drw, nofp->nof_w_drw, nofp->nof_d_w_drw,
1263				nofp->nof_rw_drw, nofp->nof_d_rw_drw, nofp->nof_flags,
1264				kauth_cred_getuid(nofp->nof_owner->noo_cred));
1265		/* clear out all open info, just to be safe */
1266		nofp->nof_access = nofp->nof_deny = 0;
1267		nofp->nof_mmap_access = nofp->nof_mmap_deny = 0;
1268		nofp->nof_r = nofp->nof_d_r = 0;
1269		nofp->nof_w = nofp->nof_d_w = 0;
1270		nofp->nof_rw = nofp->nof_d_rw = 0;
1271		nofp->nof_r_dw = nofp->nof_d_r_dw = 0;
1272		nofp->nof_w_dw = nofp->nof_d_w_dw = 0;
1273		nofp->nof_rw_dw = nofp->nof_d_rw_dw = 0;
1274		nofp->nof_r_drw = nofp->nof_d_r_drw = 0;
1275		nofp->nof_w_drw = nofp->nof_d_w_drw = 0;
1276		nofp->nof_rw_drw = nofp->nof_d_rw_drw = 0;
1277		nofp->nof_flags &= ~NFS_OPEN_FILE_CREATE;
1278		lck_mtx_unlock(&nofp->nof_lock);
1279		/* XXX we may potentially want to clean up idle/unused open file structures */
1280	}
1281	if (nofp->nof_flags & NFS_OPEN_FILE_LOST) {
1282		error = EIO;
1283		NP(np, "nfs_close: LOST%s, %d", !nofp->nof_opencnt ? " (last)" : "",
1284			kauth_cred_getuid(nofp->nof_owner->noo_cred));
1285	}
1286
1287	return (error);
1288}
1289
1290
1291
1292
1293int
1294nfs3_getattr_rpc(
1295	nfsnode_t np,
1296	mount_t mp,
1297	u_char *fhp,
1298	size_t fhsize,
1299	int flags,
1300	vfs_context_t ctx,
1301	struct nfs_vattr *nvap,
1302	u_int64_t *xidp)
1303{
1304	struct nfsmount *nmp = mp ? VFSTONFS(mp) : NFSTONMP(np);
1305	int error = 0, status, nfsvers, rpcflags = 0;
1306	struct nfsm_chain nmreq, nmrep;
1307
1308	if (!nmp)
1309		return (ENXIO);
1310	nfsvers = nmp->nm_vers;
1311
1312	if (flags & NGA_MONITOR) /* vnode monitor requests should be soft */
1313		rpcflags = R_RECOVER;
1314
1315	nfsm_chain_null(&nmreq);
1316	nfsm_chain_null(&nmrep);
1317
1318	nfsm_chain_build_alloc_init(error, &nmreq, NFSX_FH(nfsvers));
1319	if (nfsvers != NFS_VER2)
1320		nfsm_chain_add_32(error, &nmreq, fhsize);
1321	nfsm_chain_add_opaque(error, &nmreq, fhp, fhsize);
1322	nfsm_chain_build_done(error, &nmreq);
1323	nfsmout_if(error);
1324	error = nfs_request2(np, mp, &nmreq, NFSPROC_GETATTR,
1325			vfs_context_thread(ctx), vfs_context_ucred(ctx),
1326			NULL, rpcflags, &nmrep, xidp, &status);
1327	if (!error)
1328		error = status;
1329	nfsmout_if(error);
1330	error = nfs_parsefattr(&nmrep, nfsvers, nvap);
1331nfsmout:
1332	nfsm_chain_cleanup(&nmreq);
1333	nfsm_chain_cleanup(&nmrep);
1334	return (error);
1335}
1336
1337
1338int
1339nfs_getattr(nfsnode_t np, struct nfs_vattr *nvap, vfs_context_t ctx, int flags)
1340{
1341	struct nfsmount *nmp;
1342	int error = 0, nfsvers, inprogset = 0, wanted = 0, avoidfloods;
1343	struct nfs_vattr nvattr;
1344	struct timespec ts = { 2, 0 };
1345	u_int64_t xid;
1346
1347	FSDBG_TOP(513, np->n_size, np, np->n_vattr.nva_size, np->n_flag);
1348
1349	if (!(nmp = NFSTONMP(np)))
1350		return (ENXIO);
1351	nfsvers = nmp->nm_vers;
1352
1353	if (!nvap)
1354		nvap = &nvattr;
1355	NVATTR_INIT(nvap);
1356
1357	/* Update local times for special files. */
1358	if (np->n_flag & (NACC | NUPD)) {
1359		nfs_node_lock_force(np);
1360		np->n_flag |= NCHG;
1361		nfs_node_unlock(np);
1362	}
1363	/* Update size, if necessary */
1364	if (ISSET(np->n_flag, NUPDATESIZE))
1365		nfs_data_update_size(np, 0);
1366
1367	error = nfs_node_lock(np);
1368	nfsmout_if(error);
1369	if (!(flags & (NGA_UNCACHED|NGA_MONITOR)) || ((nfsvers >= NFS_VER4) && (np->n_openflags & N_DELEG_MASK))) {
1370		/*
1371		 * Use the cache or wait for any getattr in progress if:
1372		 * - it's a cached request, or
1373		 * - we have a delegation
1374		 */
1375		while (1) {
1376			error = nfs_getattrcache(np, nvap, flags);
1377			if (!error || (error != ENOENT)) {
1378				nfs_node_unlock(np);
1379				goto nfsmout;
1380			}
1381			error = 0;
1382			if (!ISSET(np->n_flag, NGETATTRINPROG))
1383				break;
1384			if (flags & NGA_MONITOR) {
1385				/* no need to wait if a request is pending */
1386				error = EINPROGRESS;
1387				nfs_node_unlock(np);
1388				goto nfsmout;
1389			}
1390			SET(np->n_flag, NGETATTRWANT);
1391			msleep(np, &np->n_lock, PZERO-1, "nfsgetattrwant", &ts);
1392			if ((error = nfs_sigintr(NFSTONMP(np), NULL, vfs_context_thread(ctx), 0))) {
1393				nfs_node_unlock(np);
1394				goto nfsmout;
1395			}
1396		}
1397		SET(np->n_flag, NGETATTRINPROG);
1398		inprogset = 1;
1399	} else if (!ISSET(np->n_flag, NGETATTRINPROG)) {
1400		SET(np->n_flag, NGETATTRINPROG);
1401		inprogset = 1;
1402	} else if (flags & NGA_MONITOR) {
1403		/* no need to make a request if one is pending */
1404		error = EINPROGRESS;
1405	}
1406	nfs_node_unlock(np);
1407
1408	nmp = NFSTONMP(np);
1409	if (!nmp)
1410		error = ENXIO;
1411	if (error)
1412		goto nfsmout;
1413
1414	/*
1415	 * We might want to try to get both the attributes and access info by
1416	 * making an ACCESS call and seeing if it returns updated attributes.
1417	 * But don't bother if we aren't caching access info or if the
1418	 * attributes returned wouldn't be cached.
1419	 */
1420	if (!(flags & NGA_ACL) && (nfsvers != NFS_VER2) && nfs_access_for_getattr && (nfs_access_cache_timeout > 0)) {
1421		if (nfs_attrcachetimeout(np) > 0) {
1422			/*  OSAddAtomic(1, &nfsstats.accesscache_misses); */
1423			u_int32_t access = NFS_ACCESS_ALL;
1424			error = nmp->nm_funcs->nf_access_rpc(np, &access, ctx);
1425			if (error)
1426				goto nfsmout;
1427			nfs_node_lock_force(np);
1428			error = nfs_getattrcache(np, nvap, flags);
1429			nfs_node_unlock(np);
1430			if (!error || (error != ENOENT))
1431				goto nfsmout;
1432			/* Well, that didn't work... just do a getattr... */
1433			error = 0;
1434		}
1435	}
1436
1437	avoidfloods = 0;
1438tryagain:
1439	error = nmp->nm_funcs->nf_getattr_rpc(np, NULL, np->n_fhp, np->n_fhsize, flags, ctx, nvap, &xid);
1440	if (!error) {
1441		nfs_node_lock_force(np);
1442		error = nfs_loadattrcache(np, nvap, &xid, 0);
1443		nfs_node_unlock(np);
1444	}
1445	nfsmout_if(error);
1446	if (!xid) { /* out-of-order rpc - attributes were dropped */
1447		FSDBG(513, -1, np, np->n_xid >> 32, np->n_xid);
1448		if (avoidfloods++ < 20)
1449			goto tryagain;
1450		/* avoidfloods>1 is bizarre.  at 20 pull the plug */
1451		/* just return the last attributes we got */
1452	}
1453nfsmout:
1454	nfs_node_lock_force(np);
1455	if (inprogset) {
1456		wanted = ISSET(np->n_flag, NGETATTRWANT);
1457		CLR(np->n_flag, (NGETATTRINPROG | NGETATTRWANT));
1458	}
1459	if (!error) {
1460		/* check if the node changed on us */
1461		vnode_t vp = NFSTOV(np);
1462		enum vtype vtype = vnode_vtype(vp);
1463		if ((vtype == VDIR) && NFS_CHANGED_NC(nfsvers, np, nvap)) {
1464			FSDBG(513, -1, np, 0, np);
1465			np->n_flag &= ~NNEGNCENTRIES;
1466			cache_purge(vp);
1467			np->n_ncgen++;
1468			NFS_CHANGED_UPDATE_NC(nfsvers, np, nvap);
1469			NFS_VNOP_DBG("Purge directory 0x%llx\n",
1470			      (uint64_t)VM_KERNEL_ADDRPERM(vp));
1471		}
1472		if (NFS_CHANGED(nfsvers, np, nvap)) {
1473			FSDBG(513, -1, np, -1, np);
1474			if (vtype == VDIR) {
1475				NFS_VNOP_DBG("Invalidate directory 0x%llx\n",
1476			               (uint64_t)VM_KERNEL_ADDRPERM(vp));
1477				nfs_invaldir(np);
1478			}
1479			nfs_node_unlock(np);
1480			if (wanted)
1481				wakeup(np);
1482			error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1);
1483			FSDBG(513, -1, np, -2, error);
1484			if (!error) {
1485				nfs_node_lock_force(np);
1486				NFS_CHANGED_UPDATE(nfsvers, np, nvap);
1487				nfs_node_unlock(np);
1488			}
1489		} else {
1490			nfs_node_unlock(np);
1491			if (wanted)
1492				wakeup(np);
1493		}
1494	} else {
1495		nfs_node_unlock(np);
1496		if (wanted)
1497			wakeup(np);
1498	}
1499
1500	if (nvap == &nvattr) {
1501		NVATTR_CLEANUP(nvap);
1502	} else if (!(flags & NGA_ACL)) {
1503		/* make sure we don't return an ACL if it wasn't asked for */
1504		NFS_BITMAP_CLR(nvap->nva_bitmap, NFS_FATTR_ACL);
1505		if (nvap->nva_acl) {
1506			kauth_acl_free(nvap->nva_acl);
1507			nvap->nva_acl = NULL;
1508		}
1509	}
1510	FSDBG_BOT(513, np->n_size, error, np->n_vattr.nva_size, np->n_flag);
1511	return (error);
1512}
1513
1514/*
1515 * NFS getattr call from vfs.
1516 */
1517
1518/*
1519 * The attributes we support over the wire.
1520 * We also get fsid but the vfs layer gets it out of the mount
1521 * structure after this calling us so there's no need to return it,
1522 * and Finder expects to call getattrlist just looking for the FSID
1523 * with out hanging on a non responsive server.
1524 */
1525#define NFS3_SUPPORTED_VATTRS \
1526	(VNODE_ATTR_va_rdev |		\
1527	 VNODE_ATTR_va_nlink |		\
1528	 VNODE_ATTR_va_data_size |	\
1529	 VNODE_ATTR_va_data_alloc |	\
1530	 VNODE_ATTR_va_uid |		\
1531	 VNODE_ATTR_va_gid |		\
1532	 VNODE_ATTR_va_mode |		\
1533	 VNODE_ATTR_va_modify_time |	\
1534	 VNODE_ATTR_va_change_time |	\
1535	 VNODE_ATTR_va_access_time |	\
1536	 VNODE_ATTR_va_fileid |		\
1537	 VNODE_ATTR_va_type)
1538
1539int
1540nfs3_vnop_getattr(
1541	struct vnop_getattr_args /* {
1542		struct vnodeop_desc *a_desc;
1543		vnode_t a_vp;
1544		struct vnode_attr *a_vap;
1545		vfs_context_t a_context;
1546	} */ *ap)
1547{
1548	int error;
1549	struct nfs_vattr nva;
1550	struct vnode_attr *vap = ap->a_vap;
1551	dev_t rdev;
1552
1553	/*
1554	 * Lets don't go over the wire if we don't support any of the attributes.
1555	 * Just fall through at the VFS layer and let it cons up what it needs.
1556	 */
1557	/* Return the io size no matter what, since we don't go over the wire for this */
1558	VATTR_RETURN(vap, va_iosize, nfs_iosize);
1559	if ((vap->va_active & NFS3_SUPPORTED_VATTRS) == 0)
1560		return (0);
1561
1562	if (VATTR_IS_ACTIVE(ap->a_vap, va_name))
1563	    NFS_VNOP_DBG("Getting attrs for 0x%llx, vname is %s\n",
1564	          (uint64_t)VM_KERNEL_ADDRPERM(ap->a_vp),
1565	          ap->a_vp->v_name ? ap->a_vp->v_name : "empty");
1566	error = nfs_getattr(VTONFS(ap->a_vp), &nva, ap->a_context, NGA_CACHED);
1567	if (error)
1568		return (error);
1569
1570	/* copy nva to *a_vap */
1571	VATTR_RETURN(vap, va_type, nva.nva_type);
1572	VATTR_RETURN(vap, va_mode, nva.nva_mode);
1573	rdev = makedev(nva.nva_rawdev.specdata1, nva.nva_rawdev.specdata2);
1574	VATTR_RETURN(vap, va_rdev, rdev);
1575	VATTR_RETURN(vap, va_uid, nva.nva_uid);
1576	VATTR_RETURN(vap, va_gid, nva.nva_gid);
1577	VATTR_RETURN(vap, va_nlink, nva.nva_nlink);
1578	VATTR_RETURN(vap, va_fileid, nva.nva_fileid);
1579	VATTR_RETURN(vap, va_data_size, nva.nva_size);
1580	VATTR_RETURN(vap, va_data_alloc, nva.nva_bytes);
1581	vap->va_access_time.tv_sec = nva.nva_timesec[NFSTIME_ACCESS];
1582	vap->va_access_time.tv_nsec = nva.nva_timensec[NFSTIME_ACCESS];
1583	VATTR_SET_SUPPORTED(vap, va_access_time);
1584	vap->va_modify_time.tv_sec = nva.nva_timesec[NFSTIME_MODIFY];
1585	vap->va_modify_time.tv_nsec = nva.nva_timensec[NFSTIME_MODIFY];
1586	VATTR_SET_SUPPORTED(vap, va_modify_time);
1587	vap->va_change_time.tv_sec = nva.nva_timesec[NFSTIME_CHANGE];
1588	vap->va_change_time.tv_nsec = nva.nva_timensec[NFSTIME_CHANGE];
1589	VATTR_SET_SUPPORTED(vap, va_change_time);
1590
1591	// VATTR_RETURN(vap, va_encoding, 0xffff /* kTextEncodingUnknown */);
1592	return (error);
1593}
1594
1595/*
1596 * NFS setattr call.
1597 */
1598int
1599nfs_vnop_setattr(
1600	struct vnop_setattr_args /* {
1601		struct vnodeop_desc *a_desc;
1602		vnode_t a_vp;
1603		struct vnode_attr *a_vap;
1604		vfs_context_t a_context;
1605	} */ *ap)
1606{
1607	vfs_context_t ctx = ap->a_context;
1608	vnode_t vp = ap->a_vp;
1609	nfsnode_t np = VTONFS(vp);
1610	struct nfsmount *nmp;
1611	struct vnode_attr *vap = ap->a_vap;
1612	int error = 0;
1613	int biosize, nfsvers, namedattrs;
1614	u_quad_t origsize, vapsize;
1615	struct nfs_dulookup dul;
1616	nfsnode_t dnp = NULL;
1617	vnode_t dvp = NULL;
1618	const char *vname = NULL;
1619	struct nfs_open_owner *noop = NULL;
1620	struct nfs_open_file *nofp = NULL;
1621
1622	nmp = VTONMP(vp);
1623	if (!nmp)
1624		return (ENXIO);
1625	nfsvers = nmp->nm_vers;
1626	namedattrs = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR);
1627	biosize = nmp->nm_biosize;
1628
1629	/* Disallow write attempts if the filesystem is mounted read-only. */
1630	if (vnode_vfsisrdonly(vp))
1631		return (EROFS);
1632
1633	origsize = np->n_size;
1634	if (VATTR_IS_ACTIVE(vap, va_data_size)) {
1635		switch (vnode_vtype(vp)) {
1636		case VDIR:
1637			return (EISDIR);
1638		case VCHR:
1639		case VBLK:
1640		case VSOCK:
1641		case VFIFO:
1642			if (!VATTR_IS_ACTIVE(vap, va_modify_time) &&
1643			    !VATTR_IS_ACTIVE(vap, va_access_time) &&
1644			    !VATTR_IS_ACTIVE(vap, va_mode) &&
1645			    !VATTR_IS_ACTIVE(vap, va_uid) &&
1646			    !VATTR_IS_ACTIVE(vap, va_gid)) {
1647				return (0);
1648			}
1649			VATTR_CLEAR_ACTIVE(vap, va_data_size);
1650			break;
1651		default:
1652			/*
1653			 * Disallow write attempts if the filesystem is
1654			 * mounted read-only.
1655			 */
1656			if (vnode_vfsisrdonly(vp))
1657				return (EROFS);
1658			FSDBG_TOP(512, np->n_size, vap->va_data_size,
1659				  np->n_vattr.nva_size, np->n_flag);
1660			/* clear NNEEDINVALIDATE, if set */
1661			if ((error = nfs_node_lock(np)))
1662				return (error);
1663			if (np->n_flag & NNEEDINVALIDATE)
1664				np->n_flag &= ~NNEEDINVALIDATE;
1665			nfs_node_unlock(np);
1666			/* flush everything */
1667			error = nfs_vinvalbuf(vp, (vap->va_data_size ? V_SAVE : 0) , ctx, 1);
1668			if (error) {
1669				NP(np, "nfs_setattr: nfs_vinvalbuf %d", error);
1670				FSDBG_BOT(512, np->n_size, vap->va_data_size, np->n_vattr.nva_size, -1);
1671				return (error);
1672			}
1673			if (nfsvers >= NFS_VER4) {
1674				/* setting file size requires having the file open for write access */
1675				if (np->n_flag & NREVOKE)
1676					return (EIO);
1677				noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 1);
1678				if (!noop)
1679					return (ENOMEM);
1680restart:
1681				error = nfs_mount_state_in_use_start(nmp, vfs_context_thread(ctx));
1682				if (error)
1683					return (error);
1684				if (np->n_flag & NREVOKE) {
1685					nfs_mount_state_in_use_end(nmp, 0);
1686					return (EIO);
1687				}
1688				error = nfs_open_file_find(np, noop, &nofp, 0, 0, 1);
1689				if (!error && (nofp->nof_flags & NFS_OPEN_FILE_LOST))
1690					error = EIO;
1691				if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) {
1692					nfs_mount_state_in_use_end(nmp, 0);
1693					error = nfs4_reopen(nofp, vfs_context_thread(ctx));
1694					nofp = NULL;
1695					if (!error)
1696						goto restart;
1697				}
1698				if (!error)
1699					error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx));
1700				if (error) {
1701					nfs_open_owner_rele(noop);
1702					return (error);
1703				}
1704				if (!(nofp->nof_access & NFS_OPEN_SHARE_ACCESS_WRITE)) {
1705					/* we don't have the file open for write access, so open it */
1706					error = nfs4_open(np, nofp, NFS_OPEN_SHARE_ACCESS_WRITE, NFS_OPEN_SHARE_DENY_NONE, ctx);
1707					if (!error)
1708						nofp->nof_flags |= NFS_OPEN_FILE_SETATTR;
1709					if (nfs_mount_state_error_should_restart(error)) {
1710						nfs_open_file_clear_busy(nofp);
1711						nofp = NULL;
1712						if (nfs_mount_state_in_use_end(nmp, error))
1713							goto restart;
1714					}
1715				}
1716			}
1717			nfs_data_lock(np, NFS_DATA_LOCK_EXCLUSIVE);
1718			if (np->n_size > vap->va_data_size) { /* shrinking? */
1719				daddr64_t obn, bn;
1720				int neweofoff, mustwrite;
1721				struct nfsbuf *bp;
1722
1723				obn = (np->n_size - 1) / biosize;
1724				bn = vap->va_data_size / biosize;
1725				for ( ; obn >= bn; obn--) {
1726					if (!nfs_buf_is_incore(np, obn))
1727						continue;
1728					error = nfs_buf_get(np, obn, biosize, NULL, NBLK_READ, &bp);
1729					if (error)
1730						continue;
1731					if (obn != bn) {
1732						FSDBG(512, bp, bp->nb_flags, 0, obn);
1733						SET(bp->nb_flags, NB_INVAL);
1734						nfs_buf_release(bp, 1);
1735						continue;
1736					}
1737					mustwrite = 0;
1738					neweofoff = vap->va_data_size - NBOFF(bp);
1739					/* check for any dirty data before the new EOF */
1740					if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < neweofoff)) {
1741						/* clip dirty range to EOF */
1742						if (bp->nb_dirtyend > neweofoff) {
1743							bp->nb_dirtyend = neweofoff;
1744							if (bp->nb_dirtyoff >= bp->nb_dirtyend)
1745								bp->nb_dirtyoff = bp->nb_dirtyend = 0;
1746						}
1747						if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < neweofoff))
1748							mustwrite++;
1749					}
1750					bp->nb_dirty &= (1 << round_page_32(neweofoff)/PAGE_SIZE) - 1;
1751					if (bp->nb_dirty)
1752						mustwrite++;
1753					if (!mustwrite) {
1754						FSDBG(512, bp, bp->nb_flags, 0, obn);
1755						SET(bp->nb_flags, NB_INVAL);
1756						nfs_buf_release(bp, 1);
1757						continue;
1758					}
1759					/* gotta write out dirty data before invalidating */
1760					/* (NB_STABLE indicates that data writes should be FILESYNC) */
1761					/* (NB_NOCACHE indicates buffer should be discarded) */
1762					CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC | NB_READ));
1763					SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
1764					if (!IS_VALID_CRED(bp->nb_wcred)) {
1765						kauth_cred_t cred = vfs_context_ucred(ctx);
1766						kauth_cred_ref(cred);
1767						bp->nb_wcred = cred;
1768					}
1769					error = nfs_buf_write(bp);
1770					// Note: bp has been released
1771					if (error) {
1772						FSDBG(512, bp, 0xd00dee, 0xbad, error);
1773						nfs_node_lock_force(np);
1774						np->n_error = error;
1775						np->n_flag |= NWRITEERR;
1776						/*
1777						 * There was a write error and we need to
1778						 * invalidate attrs and flush buffers in
1779						 * order to sync up with the server.
1780						 * (if this write was extending the file,
1781						 * we may no longer know the correct size)
1782						 */
1783						NATTRINVALIDATE(np);
1784						nfs_node_unlock(np);
1785						nfs_data_unlock(np);
1786						nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1);
1787						nfs_data_lock(np, NFS_DATA_LOCK_EXCLUSIVE);
1788						error = 0;
1789					}
1790				}
1791			}
1792			if (vap->va_data_size != np->n_size)
1793				ubc_setsize(vp, (off_t)vap->va_data_size); /* XXX error? */
1794			origsize = np->n_size;
1795			np->n_size = np->n_vattr.nva_size = vap->va_data_size;
1796			nfs_node_lock_force(np);
1797			CLR(np->n_flag, NUPDATESIZE);
1798			nfs_node_unlock(np);
1799			FSDBG(512, np, np->n_size, np->n_vattr.nva_size, 0xf00d0001);
1800		}
1801	} else if (VATTR_IS_ACTIVE(vap, va_modify_time) ||
1802		    VATTR_IS_ACTIVE(vap, va_access_time) ||
1803		    (vap->va_vaflags & VA_UTIMES_NULL)) {
1804		if ((error = nfs_node_lock(np)))
1805			return (error);
1806		if ((np->n_flag & NMODIFIED) && (vnode_vtype(vp) == VREG)) {
1807			nfs_node_unlock(np);
1808			error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1);
1809			if (error == EINTR)
1810				return (error);
1811		} else {
1812			nfs_node_unlock(np);
1813		}
1814	}
1815	if ((VATTR_IS_ACTIVE(vap, va_mode) || VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid) ||
1816	     VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid)) &&
1817	    !(error = nfs_node_lock(np))) {
1818		NACCESSINVALIDATE(np);
1819		nfs_node_unlock(np);
1820		if (!namedattrs) {
1821			dvp = vnode_getparent(vp);
1822			vname = vnode_getname(vp);
1823			dnp = (dvp && vname) ? VTONFS(dvp) : NULL;
1824			if (dnp) {
1825				error = nfs_node_set_busy(dnp, vfs_context_thread(ctx));
1826				if (error) {
1827					dnp = NULL;
1828					error = 0;
1829				}
1830			}
1831			if (dnp) {
1832				nfs_dulookup_init(&dul, dnp, vname, strlen(vname), ctx);
1833				nfs_dulookup_start(&dul, dnp, ctx);
1834			}
1835		}
1836	}
1837
1838	if (!error)
1839		error = nmp->nm_funcs->nf_setattr_rpc(np, vap, ctx);
1840
1841	if (VATTR_IS_ACTIVE(vap, va_mode) || VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid) ||
1842	    VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid)) {
1843		if (!namedattrs) {
1844			if (dnp) {
1845				nfs_dulookup_finish(&dul, dnp, ctx);
1846				nfs_node_clear_busy(dnp);
1847			}
1848			if (dvp != NULLVP)
1849				vnode_put(dvp);
1850			if (vname != NULL)
1851				vnode_putname(vname);
1852		}
1853	}
1854
1855	FSDBG_BOT(512, np->n_size, vap->va_data_size, np->n_vattr.nva_size, error);
1856	if (VATTR_IS_ACTIVE(vap, va_data_size)) {
1857		if (error && (origsize != np->n_size) &&
1858		    ((nfsvers < NFS_VER4) || !nfs_mount_state_error_should_restart(error))) {
1859			/* make every effort to resync file size w/ server... */
1860			/* (don't bother if we'll be restarting the operation) */
1861			int err; /* preserve "error" for return */
1862			np->n_size = np->n_vattr.nva_size = origsize;
1863			nfs_node_lock_force(np);
1864			CLR(np->n_flag, NUPDATESIZE);
1865			nfs_node_unlock(np);
1866			FSDBG(512, np, np->n_size, np->n_vattr.nva_size, 0xf00d0002);
1867			ubc_setsize(vp, (off_t)np->n_size); /* XXX check error */
1868			vapsize = vap->va_data_size;
1869			vap->va_data_size = origsize;
1870			err = nmp->nm_funcs->nf_setattr_rpc(np, vap, ctx);
1871			if (err)
1872				NP(np, "nfs_vnop_setattr: nfs%d_setattr_rpc %d %d", nfsvers, error, err);
1873			vap->va_data_size = vapsize;
1874		}
1875		nfs_node_lock_force(np);
1876		/*
1877		 * The size was just set.  If the size is already marked for update, don't
1878		 * trust the newsize (it may have been set while the setattr was in progress).
1879		 * Clear the update flag and make sure we fetch new attributes so we are sure
1880		 * we have the latest size.
1881		 */
1882		if (ISSET(np->n_flag, NUPDATESIZE)) {
1883			CLR(np->n_flag, NUPDATESIZE);
1884			NATTRINVALIDATE(np);
1885			nfs_node_unlock(np);
1886			nfs_getattr(np, NULL, ctx, NGA_UNCACHED);
1887		} else {
1888			nfs_node_unlock(np);
1889		}
1890		nfs_data_unlock(np);
1891		if (nfsvers >= NFS_VER4) {
1892			if (nofp) {
1893				/* don't close our setattr open if we'll be restarting... */
1894				if (!nfs_mount_state_error_should_restart(error) &&
1895				    (nofp->nof_flags & NFS_OPEN_FILE_SETATTR)) {
1896					int err = nfs_close(np, nofp, NFS_OPEN_SHARE_ACCESS_WRITE, NFS_OPEN_SHARE_DENY_NONE, ctx);
1897					if (err)
1898						NP(np, "nfs_vnop_setattr: close error: %d", err);
1899					nofp->nof_flags &= ~NFS_OPEN_FILE_SETATTR;
1900				}
1901				nfs_open_file_clear_busy(nofp);
1902				nofp = NULL;
1903			}
1904			if (nfs_mount_state_in_use_end(nmp, error))
1905				goto restart;
1906			nfs_open_owner_rele(noop);
1907		}
1908	}
1909	return (error);
1910}
1911
1912/*
1913 * Do an NFS setattr RPC.
1914 */
1915int
1916nfs3_setattr_rpc(
1917	nfsnode_t np,
1918	struct vnode_attr *vap,
1919	vfs_context_t ctx)
1920{
1921	struct nfsmount *nmp = NFSTONMP(np);
1922	int error = 0, lockerror = ENOENT, status, wccpostattr = 0, nfsvers;
1923	u_int64_t xid, nextxid;
1924	struct nfsm_chain nmreq, nmrep;
1925
1926	if (!nmp)
1927		return (ENXIO);
1928	nfsvers = nmp->nm_vers;
1929
1930	VATTR_SET_SUPPORTED(vap, va_mode);
1931	VATTR_SET_SUPPORTED(vap, va_uid);
1932	VATTR_SET_SUPPORTED(vap, va_gid);
1933	VATTR_SET_SUPPORTED(vap, va_data_size);
1934	VATTR_SET_SUPPORTED(vap, va_access_time);
1935	VATTR_SET_SUPPORTED(vap, va_modify_time);
1936
1937	if (VATTR_IS_ACTIVE(vap, va_flags)) {
1938		if (vap->va_flags) {	/* we don't support setting flags */
1939			if (vap->va_active & ~VNODE_ATTR_va_flags)
1940				return (EINVAL);	/* return EINVAL if other attributes also set */
1941			else
1942				return (ENOTSUP);	/* return ENOTSUP for chflags(2) */
1943		}
1944		/* no flags set, so we'll just ignore it */
1945		if (!(vap->va_active & ~VNODE_ATTR_va_flags))
1946			return (0); /* no (other) attributes to set, so nothing to do */
1947	}
1948
1949	nfsm_chain_null(&nmreq);
1950	nfsm_chain_null(&nmrep);
1951
1952	nfsm_chain_build_alloc_init(error, &nmreq,
1953		NFSX_FH(nfsvers) + NFSX_SATTR(nfsvers));
1954	nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize);
1955	if (nfsvers == NFS_VER3) {
1956		if (VATTR_IS_ACTIVE(vap, va_mode)) {
1957			nfsm_chain_add_32(error, &nmreq, TRUE);
1958			nfsm_chain_add_32(error, &nmreq, vap->va_mode);
1959		} else {
1960			nfsm_chain_add_32(error, &nmreq, FALSE);
1961		}
1962		if (VATTR_IS_ACTIVE(vap, va_uid)) {
1963			nfsm_chain_add_32(error, &nmreq, TRUE);
1964			nfsm_chain_add_32(error, &nmreq, vap->va_uid);
1965		} else {
1966			nfsm_chain_add_32(error, &nmreq, FALSE);
1967		}
1968		if (VATTR_IS_ACTIVE(vap, va_gid)) {
1969			nfsm_chain_add_32(error, &nmreq, TRUE);
1970			nfsm_chain_add_32(error, &nmreq, vap->va_gid);
1971		} else {
1972			nfsm_chain_add_32(error, &nmreq, FALSE);
1973		}
1974		if (VATTR_IS_ACTIVE(vap, va_data_size)) {
1975			nfsm_chain_add_32(error, &nmreq, TRUE);
1976			nfsm_chain_add_64(error, &nmreq, vap->va_data_size);
1977		} else {
1978			nfsm_chain_add_32(error, &nmreq, FALSE);
1979		}
1980		if (vap->va_vaflags & VA_UTIMES_NULL) {
1981			nfsm_chain_add_32(error, &nmreq, NFS_TIME_SET_TO_SERVER);
1982			nfsm_chain_add_32(error, &nmreq, NFS_TIME_SET_TO_SERVER);
1983		} else {
1984			if (VATTR_IS_ACTIVE(vap, va_access_time)) {
1985				nfsm_chain_add_32(error, &nmreq, NFS_TIME_SET_TO_CLIENT);
1986				nfsm_chain_add_32(error, &nmreq, vap->va_access_time.tv_sec);
1987				nfsm_chain_add_32(error, &nmreq, vap->va_access_time.tv_nsec);
1988			} else {
1989				nfsm_chain_add_32(error, &nmreq, NFS_TIME_DONT_CHANGE);
1990			}
1991			if (VATTR_IS_ACTIVE(vap, va_modify_time)) {
1992				nfsm_chain_add_32(error, &nmreq, NFS_TIME_SET_TO_CLIENT);
1993				nfsm_chain_add_32(error, &nmreq, vap->va_modify_time.tv_sec);
1994				nfsm_chain_add_32(error, &nmreq, vap->va_modify_time.tv_nsec);
1995			} else {
1996				nfsm_chain_add_32(error, &nmreq, NFS_TIME_DONT_CHANGE);
1997			}
1998		}
1999		nfsm_chain_add_32(error, &nmreq, FALSE);
2000	} else {
2001		nfsm_chain_add_32(error, &nmreq, VATTR_IS_ACTIVE(vap, va_mode) ?
2002			vtonfsv2_mode(vnode_vtype(NFSTOV(np)), vap->va_mode) : -1);
2003		nfsm_chain_add_32(error, &nmreq, VATTR_IS_ACTIVE(vap, va_uid) ?
2004			vap->va_uid : (uint32_t)-1);
2005		nfsm_chain_add_32(error, &nmreq, VATTR_IS_ACTIVE(vap, va_gid) ?
2006			vap->va_gid : (uint32_t)-1);
2007		nfsm_chain_add_32(error, &nmreq, VATTR_IS_ACTIVE(vap, va_data_size) ?
2008			vap->va_data_size : (uint32_t)-1);
2009		if (VATTR_IS_ACTIVE(vap, va_access_time)) {
2010			nfsm_chain_add_32(error, &nmreq, vap->va_access_time.tv_sec);
2011			nfsm_chain_add_32(error, &nmreq, (vap->va_access_time.tv_nsec != -1) ?
2012				((uint32_t)vap->va_access_time.tv_nsec / 1000) : 0xffffffff);
2013		} else {
2014			nfsm_chain_add_32(error, &nmreq, -1);
2015			nfsm_chain_add_32(error, &nmreq, -1);
2016		}
2017		if (VATTR_IS_ACTIVE(vap, va_modify_time)) {
2018			nfsm_chain_add_32(error, &nmreq, vap->va_modify_time.tv_sec);
2019			nfsm_chain_add_32(error, &nmreq, (vap->va_modify_time.tv_nsec != -1) ?
2020				((uint32_t)vap->va_modify_time.tv_nsec / 1000) : 0xffffffff);
2021		} else {
2022			nfsm_chain_add_32(error, &nmreq, -1);
2023			nfsm_chain_add_32(error, &nmreq, -1);
2024		}
2025	}
2026	nfsm_chain_build_done(error, &nmreq);
2027	nfsmout_if(error);
2028	error = nfs_request(np, NULL, &nmreq, NFSPROC_SETATTR, ctx, NULL, &nmrep, &xid, &status);
2029	if ((lockerror = nfs_node_lock(np)))
2030		error = lockerror;
2031	if (nfsvers == NFS_VER3) {
2032		struct timespec premtime = { 0, 0 };
2033		nfsm_chain_get_wcc_data(error, &nmrep, np, &premtime, &wccpostattr, &xid);
2034		nfsmout_if(error);
2035		/* if file hadn't changed, update cached mtime */
2036		if (nfstimespeccmp(&np->n_mtime, &premtime, ==))
2037			NFS_CHANGED_UPDATE(nfsvers, np, &np->n_vattr);
2038		/* if directory hadn't changed, update namecache mtime */
2039		if ((vnode_vtype(NFSTOV(np)) == VDIR) &&
2040		    nfstimespeccmp(&np->n_ncmtime, &premtime, ==))
2041			NFS_CHANGED_UPDATE_NC(nfsvers, np, &np->n_vattr);
2042		if (!wccpostattr)
2043			NATTRINVALIDATE(np);
2044		error = status;
2045	} else {
2046		if (!error)
2047			error = status;
2048		nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid);
2049	}
2050	/*
2051	 * We just changed the attributes and we want to make sure that we
2052	 * see the latest attributes.  Get the next XID.  If it's not the
2053	 * next XID after the SETATTR XID, then it's possible that another
2054	 * RPC was in flight at the same time and it might put stale attributes
2055	 * in the cache.  In that case, we invalidate the attributes and set
2056	 * the attribute cache XID to guarantee that newer attributes will
2057	 * get loaded next.
2058	 */
2059	nextxid = 0;
2060	nfs_get_xid(&nextxid);
2061	if (nextxid != (xid + 1)) {
2062		np->n_xid = nextxid;
2063		NATTRINVALIDATE(np);
2064	}
2065nfsmout:
2066	if (!lockerror)
2067		nfs_node_unlock(np);
2068	nfsm_chain_cleanup(&nmreq);
2069	nfsm_chain_cleanup(&nmrep);
2070	return (error);
2071}
2072
2073/*
2074 * NFS lookup call, one step at a time...
2075 * First look in cache
2076 * If not found, unlock the directory nfsnode and do the RPC
2077 */
2078int
2079nfs_vnop_lookup(
2080	struct vnop_lookup_args /* {
2081		struct vnodeop_desc *a_desc;
2082		vnode_t a_dvp;
2083		vnode_t *a_vpp;
2084		struct componentname *a_cnp;
2085		vfs_context_t a_context;
2086	} */ *ap)
2087{
2088	vfs_context_t ctx = ap->a_context;
2089	struct componentname *cnp = ap->a_cnp;
2090	vnode_t dvp = ap->a_dvp;
2091	vnode_t *vpp = ap->a_vpp;
2092	int flags = cnp->cn_flags;
2093	vnode_t newvp;
2094	nfsnode_t dnp, np;
2095	struct nfsmount *nmp;
2096	mount_t mp;
2097	int nfsvers, error, busyerror = ENOENT, isdot, isdotdot, negnamecache;
2098	u_int64_t xid;
2099	struct nfs_vattr nvattr;
2100	int ngflags;
2101	struct vnop_access_args naa;
2102	fhandle_t fh;
2103	struct nfsreq rq, *req = &rq;
2104
2105	*vpp = NULLVP;
2106
2107	dnp = VTONFS(dvp);
2108	NVATTR_INIT(&nvattr);
2109
2110	mp = vnode_mount(dvp);
2111	nmp = VFSTONFS(mp);
2112	if (!nmp) {
2113		error = ENXIO;
2114		goto error_return;
2115	}
2116	nfsvers = nmp->nm_vers;
2117	negnamecache = !NMFLAG(nmp, NONEGNAMECACHE);
2118
2119	if ((error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx))))
2120		goto error_return;
2121	/* nfs_getattr() will check changed and purge caches */
2122	if ((error = nfs_getattr(dnp, NULL, ctx, NGA_CACHED)))
2123		goto error_return;
2124
2125	error = cache_lookup(dvp, vpp, cnp);
2126	switch (error) {
2127	case ENOENT:
2128		/* negative cache entry */
2129		goto error_return;
2130	case 0:
2131		/* cache miss */
2132		if ((nfsvers > NFS_VER2) && NMFLAG(nmp, RDIRPLUS)) {
2133			/* if rdirplus, try dir buf cache lookup */
2134			error = nfs_dir_buf_cache_lookup(dnp, &np, cnp, ctx, 0);
2135			if (!error && np) {
2136				/* dir buf cache hit */
2137				*vpp = NFSTOV(np);
2138				error = -1;
2139			}
2140		}
2141		if (error != -1) /* cache miss */
2142			break;
2143		/* FALLTHROUGH */
2144	case -1:
2145		/* cache hit, not really an error */
2146		OSAddAtomic64(1, &nfsstats.lookupcache_hits);
2147
2148		nfs_node_clear_busy(dnp);
2149		busyerror = ENOENT;
2150
2151		/* check for directory access */
2152		naa.a_desc = &vnop_access_desc;
2153		naa.a_vp = dvp;
2154		naa.a_action = KAUTH_VNODE_SEARCH;
2155		naa.a_context = ctx;
2156
2157		/* compute actual success/failure based on accessibility */
2158		error = nfs_vnop_access(&naa);
2159		/* FALLTHROUGH */
2160	default:
2161		/* unexpected error from cache_lookup */
2162		goto error_return;
2163	}
2164
2165	/* skip lookup, if we know who we are: "." or ".." */
2166	isdot = isdotdot = 0;
2167	if (cnp->cn_nameptr[0] == '.') {
2168		if (cnp->cn_namelen == 1)
2169			isdot = 1;
2170		if ((cnp->cn_namelen == 2) && (cnp->cn_nameptr[1] == '.'))
2171			isdotdot = 1;
2172	}
2173	if (isdotdot || isdot) {
2174		fh.fh_len = 0;
2175		goto found;
2176	}
2177	if ((nfsvers >= NFS_VER4) && (dnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER)) {
2178		/* we should never be looking things up in a trigger directory, return nothing */
2179		error = ENOENT;
2180		goto error_return;
2181	}
2182
2183	/* do we know this name is too long? */
2184	nmp = VTONMP(dvp);
2185	if (!nmp) {
2186		error = ENXIO;
2187		goto error_return;
2188	}
2189	if (NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_bitmap, NFS_FATTR_MAXNAME) &&
2190	     (cnp->cn_namelen > (int)nmp->nm_fsattr.nfsa_maxname)) {
2191		error = ENAMETOOLONG;
2192		goto error_return;
2193	}
2194
2195	error = 0;
2196	newvp = NULLVP;
2197
2198	OSAddAtomic64(1, &nfsstats.lookupcache_misses);
2199
2200	error = nmp->nm_funcs->nf_lookup_rpc_async(dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx, &req);
2201	nfsmout_if(error);
2202	error = nmp->nm_funcs->nf_lookup_rpc_async_finish(dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx, req, &xid, &fh, &nvattr);
2203	nfsmout_if(error);
2204
2205	/* is the file handle the same as this directory's file handle? */
2206	isdot = NFS_CMPFH(dnp, fh.fh_data, fh.fh_len);
2207
2208found:
2209	if (flags & ISLASTCN) {
2210		switch (cnp->cn_nameiop) {
2211		case DELETE:
2212			cnp->cn_flags &= ~MAKEENTRY;
2213			break;
2214		case RENAME:
2215			cnp->cn_flags &= ~MAKEENTRY;
2216			if (isdot) {
2217				error = EISDIR;
2218				goto error_return;
2219			}
2220			break;
2221		}
2222	}
2223
2224	if (isdotdot) {
2225		newvp = vnode_getparent(dvp);
2226		if (!newvp) {
2227			error = ENOENT;
2228			goto error_return;
2229		}
2230	} else if (isdot) {
2231		error = vnode_get(dvp);
2232		if (error)
2233			goto error_return;
2234		newvp = dvp;
2235		nfs_node_lock_force(dnp);
2236		if (fh.fh_len && (dnp->n_xid <= xid))
2237			nfs_loadattrcache(dnp, &nvattr, &xid, 0);
2238		nfs_node_unlock(dnp);
2239	} else {
2240		ngflags = (cnp->cn_flags & MAKEENTRY) ? NG_MAKEENTRY : 0;
2241		error = nfs_nget(mp, dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, ngflags, &np);
2242		if (error)
2243			goto error_return;
2244		newvp = NFSTOV(np);
2245		nfs_node_unlock(np);
2246	}
2247	*vpp = newvp;
2248
2249nfsmout:
2250	if (error) {
2251		if (((cnp->cn_nameiop == CREATE) || (cnp->cn_nameiop == RENAME)) &&
2252		    (flags & ISLASTCN) && (error == ENOENT)) {
2253			if (vnode_mount(dvp) && vnode_vfsisrdonly(dvp))
2254				error = EROFS;
2255			else
2256				error = EJUSTRETURN;
2257		}
2258	}
2259	if ((error == ENOENT) && (cnp->cn_flags & MAKEENTRY) &&
2260	    (cnp->cn_nameiop != CREATE) && negnamecache) {
2261		/* add a negative entry in the name cache */
2262		nfs_node_lock_force(dnp);
2263		cache_enter(dvp, NULL, cnp);
2264		dnp->n_flag |= NNEGNCENTRIES;
2265		nfs_node_unlock(dnp);
2266	}
2267error_return:
2268	NVATTR_CLEANUP(&nvattr);
2269	if (!busyerror)
2270		nfs_node_clear_busy(dnp);
2271	if (error && *vpp) {
2272	        vnode_put(*vpp);
2273		*vpp = NULLVP;
2274	}
2275	return (error);
2276}
2277
2278/*
2279 * NFS readlink call
2280 */
2281int
2282nfs_vnop_readlink(
2283	struct vnop_readlink_args /* {
2284		struct vnodeop_desc *a_desc;
2285		vnode_t a_vp;
2286		struct uio *a_uio;
2287		vfs_context_t a_context;
2288	} */ *ap)
2289{
2290	vfs_context_t ctx = ap->a_context;
2291	nfsnode_t np = VTONFS(ap->a_vp);
2292	struct nfsmount *nmp;
2293	int error = 0, nfsvers;
2294	uint32_t buflen;
2295	uio_t uio = ap->a_uio;
2296	struct nfsbuf *bp = NULL;
2297
2298	if (vnode_vtype(ap->a_vp) != VLNK)
2299		return (EPERM);
2300
2301	if (uio_resid(uio) == 0)
2302		return (0);
2303	if (uio_offset(uio) < 0)
2304		return (EINVAL);
2305
2306	nmp = VTONMP(ap->a_vp);
2307	if (!nmp)
2308		return (ENXIO);
2309	nfsvers = nmp->nm_vers;
2310
2311	/* nfs_getattr() will check changed and purge caches */
2312	if ((error = nfs_getattr(np, NULL, ctx, NGA_CACHED))) {
2313		FSDBG(531, np, 0xd1e0001, 0, error);
2314		return (error);
2315	}
2316
2317	OSAddAtomic64(1, &nfsstats.biocache_readlinks);
2318	error = nfs_buf_get(np, 0, NFS_MAXPATHLEN, vfs_context_thread(ctx), NBLK_READ, &bp);
2319	if (error) {
2320		FSDBG(531, np, 0xd1e0002, 0, error);
2321		return (error);
2322	}
2323	if (!ISSET(bp->nb_flags, NB_CACHE)) {
2324		OSAddAtomic64(1, &nfsstats.readlink_bios);
2325		buflen = bp->nb_bufsize;
2326		error = nmp->nm_funcs->nf_readlink_rpc(np, bp->nb_data, &buflen, ctx);
2327		if (error) {
2328			SET(bp->nb_flags, NB_ERROR);
2329			bp->nb_error = error;
2330		} else {
2331			bp->nb_validoff = 0;
2332			bp->nb_validend = buflen;
2333		}
2334	}
2335	if (!error && (bp->nb_validend > 0))
2336		error = uiomove(bp->nb_data, bp->nb_validend, uio);
2337	FSDBG(531, np, bp->nb_validend, 0, error);
2338	nfs_buf_release(bp, 1);
2339	return (error);
2340}
2341
2342/*
2343 * Do a readlink RPC.
2344 */
2345int
2346nfs3_readlink_rpc(nfsnode_t np, char *buf, uint32_t *buflenp, vfs_context_t ctx)
2347{
2348	struct nfsmount *nmp;
2349	int error = 0, lockerror = ENOENT, nfsvers, status;
2350	uint32_t len;
2351	u_int64_t xid;
2352	struct nfsm_chain nmreq, nmrep;
2353
2354	nmp = NFSTONMP(np);
2355	if (!nmp)
2356		return (ENXIO);
2357	nfsvers = nmp->nm_vers;
2358	nfsm_chain_null(&nmreq);
2359	nfsm_chain_null(&nmrep);
2360
2361	nfsm_chain_build_alloc_init(error, &nmreq, NFSX_FH(nfsvers));
2362	nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize);
2363	nfsm_chain_build_done(error, &nmreq);
2364	nfsmout_if(error);
2365	error = nfs_request(np, NULL, &nmreq, NFSPROC_READLINK, ctx, NULL, &nmrep, &xid, &status);
2366	if ((lockerror = nfs_node_lock(np)))
2367		error = lockerror;
2368	if (nfsvers == NFS_VER3)
2369		nfsm_chain_postop_attr_update(error, &nmrep, np, &xid);
2370	if (!error)
2371		error = status;
2372	nfsm_chain_get_32(error, &nmrep, len);
2373	nfsmout_if(error);
2374	if ((nfsvers == NFS_VER2) && (len > *buflenp)) {
2375		error = EBADRPC;
2376		goto nfsmout;
2377	}
2378	if (len >= *buflenp) {
2379		if (np->n_size && (np->n_size < *buflenp))
2380			len = np->n_size;
2381		else
2382			len = *buflenp - 1;
2383	}
2384	nfsm_chain_get_opaque(error, &nmrep, len, buf);
2385	if (!error)
2386		*buflenp = len;
2387nfsmout:
2388	if (!lockerror)
2389		nfs_node_unlock(np);
2390	nfsm_chain_cleanup(&nmreq);
2391	nfsm_chain_cleanup(&nmrep);
2392	return (error);
2393}
2394
2395/*
2396 * NFS read RPC call
2397 * Ditto above
2398 */
2399int
2400nfs_read_rpc(nfsnode_t np, uio_t uio, vfs_context_t ctx)
2401{
2402	struct nfsmount *nmp;
2403	int error = 0, nfsvers, eof = 0;
2404	size_t nmrsize, len, retlen;
2405	user_ssize_t tsiz;
2406	off_t txoffset;
2407	struct nfsreq rq, *req = &rq;
2408	uint32_t stategenid = 0, restart = 0;
2409
2410	FSDBG_TOP(536, np, uio_offset(uio), uio_resid(uio), 0);
2411	nmp = NFSTONMP(np);
2412	if (!nmp)
2413		return (ENXIO);
2414	nfsvers = nmp->nm_vers;
2415	nmrsize = nmp->nm_rsize;
2416
2417	txoffset = uio_offset(uio);
2418	tsiz = uio_resid(uio);
2419	if ((nfsvers == NFS_VER2) && ((uint64_t)(txoffset + tsiz) > 0xffffffffULL)) {
2420		FSDBG_BOT(536, np, uio_offset(uio), uio_resid(uio), EFBIG);
2421		return (EFBIG);
2422	}
2423
2424	while (tsiz > 0) {
2425		len = retlen = (tsiz > (user_ssize_t)nmrsize) ? nmrsize : (size_t)tsiz;
2426		FSDBG(536, np, txoffset, len, 0);
2427		if (np->n_flag & NREVOKE) {
2428			error = EIO;
2429			break;
2430		}
2431		if (nmp->nm_vers >= NFS_VER4)
2432			stategenid = nmp->nm_stategenid;
2433		error = nmp->nm_funcs->nf_read_rpc_async(np, txoffset, len,
2434				vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, &req);
2435		if (!error)
2436			error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, uio, &retlen, &eof);
2437		if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) &&
2438		    (++restart <= nfs_mount_state_max_restarts(nmp))) { /* guard against no progress */
2439			lck_mtx_lock(&nmp->nm_lock);
2440			if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid)) {
2441				NP(np, "nfs_read_rpc: error %d, initiating recovery", error);
2442				nfs_need_recover(nmp, error);
2443			}
2444			lck_mtx_unlock(&nmp->nm_lock);
2445			if (np->n_flag & NREVOKE) {
2446				error = EIO;
2447			} else {
2448				if (error == NFSERR_GRACE)
2449					tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz);
2450				if (!(error = nfs_mount_state_wait_for_recovery(nmp)))
2451					continue;
2452			}
2453		}
2454		if (error)
2455			break;
2456		txoffset += retlen;
2457		tsiz -= retlen;
2458		if (nfsvers != NFS_VER2) {
2459			if (eof || (retlen == 0))
2460				tsiz = 0;
2461		} else if (retlen < len)
2462			tsiz = 0;
2463	}
2464
2465	FSDBG_BOT(536, np, eof, uio_resid(uio), error);
2466	return (error);
2467}
2468
2469int
2470nfs3_read_rpc_async(
2471	nfsnode_t np,
2472	off_t offset,
2473	size_t len,
2474	thread_t thd,
2475	kauth_cred_t cred,
2476	struct nfsreq_cbinfo *cb,
2477	struct nfsreq **reqp)
2478{
2479	struct nfsmount *nmp;
2480	int error = 0, nfsvers;
2481	struct nfsm_chain nmreq;
2482
2483	nmp = NFSTONMP(np);
2484	if (!nmp)
2485		return (ENXIO);
2486	nfsvers = nmp->nm_vers;
2487
2488	nfsm_chain_null(&nmreq);
2489	nfsm_chain_build_alloc_init(error, &nmreq, NFSX_FH(nfsvers) + 3 * NFSX_UNSIGNED);
2490	nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize);
2491	if (nfsvers == NFS_VER3) {
2492		nfsm_chain_add_64(error, &nmreq, offset);
2493		nfsm_chain_add_32(error, &nmreq, len);
2494	} else {
2495		nfsm_chain_add_32(error, &nmreq, offset);
2496		nfsm_chain_add_32(error, &nmreq, len);
2497		nfsm_chain_add_32(error, &nmreq, 0);
2498	}
2499	nfsm_chain_build_done(error, &nmreq);
2500	nfsmout_if(error);
2501	error = nfs_request_async(np, NULL, &nmreq, NFSPROC_READ, thd, cred, NULL, 0, cb, reqp);
2502nfsmout:
2503	nfsm_chain_cleanup(&nmreq);
2504	return (error);
2505}
2506
2507int
2508nfs3_read_rpc_async_finish(
2509	nfsnode_t np,
2510	struct nfsreq *req,
2511	uio_t uio,
2512	size_t *lenp,
2513	int *eofp)
2514{
2515	int error = 0, lockerror, nfsvers, status, eof = 0;
2516	size_t retlen = 0;
2517	uint64_t xid;
2518	struct nfsmount *nmp;
2519	struct nfsm_chain nmrep;
2520
2521	nmp = NFSTONMP(np);
2522	if (!nmp) {
2523		nfs_request_async_cancel(req);
2524		return (ENXIO);
2525	}
2526	nfsvers = nmp->nm_vers;
2527
2528	nfsm_chain_null(&nmrep);
2529
2530	error = nfs_request_async_finish(req, &nmrep, &xid, &status);
2531	if (error == EINPROGRESS) /* async request restarted */
2532		return (error);
2533
2534	if ((lockerror = nfs_node_lock(np)))
2535		error = lockerror;
2536	if (nfsvers == NFS_VER3)
2537		nfsm_chain_postop_attr_update(error, &nmrep, np, &xid);
2538	if (!error)
2539		error = status;
2540	if (nfsvers == NFS_VER3) {
2541		nfsm_chain_adv(error, &nmrep, NFSX_UNSIGNED);
2542		nfsm_chain_get_32(error, &nmrep, eof);
2543	} else {
2544		nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid);
2545	}
2546	if (!lockerror)
2547		nfs_node_unlock(np);
2548	nfsm_chain_get_32(error, &nmrep, retlen);
2549	if ((nfsvers == NFS_VER2) && (retlen > *lenp))
2550		error = EBADRPC;
2551	nfsmout_if(error);
2552	error = nfsm_chain_get_uio(&nmrep, MIN(retlen, *lenp), uio);
2553	if (eofp) {
2554		if (nfsvers == NFS_VER3) {
2555			if (!eof && !retlen)
2556				eof = 1;
2557		} else if (retlen < *lenp) {
2558			eof = 1;
2559		}
2560		*eofp = eof;
2561	}
2562	*lenp = MIN(retlen, *lenp);
2563nfsmout:
2564	nfsm_chain_cleanup(&nmrep);
2565	return (error);
2566}
2567
2568/*
2569 * NFS write call
2570 */
2571int
2572nfs_vnop_write(
2573	struct vnop_write_args /* {
2574		struct vnodeop_desc *a_desc;
2575		vnode_t a_vp;
2576		struct uio *a_uio;
2577		int a_ioflag;
2578		vfs_context_t a_context;
2579	} */ *ap)
2580{
2581	vfs_context_t ctx = ap->a_context;
2582	uio_t uio = ap->a_uio;
2583	vnode_t vp = ap->a_vp;
2584	nfsnode_t np = VTONFS(vp);
2585	int ioflag = ap->a_ioflag;
2586	struct nfsbuf *bp;
2587	struct nfsmount *nmp = VTONMP(vp);
2588	daddr64_t lbn;
2589	int biosize;
2590	int n, on, error = 0;
2591	off_t boff, start, end;
2592	uio_t auio;
2593	char auio_buf [ UIO_SIZEOF(1) ];
2594	thread_t thd;
2595	kauth_cred_t cred;
2596
2597	FSDBG_TOP(515, np, uio_offset(uio), uio_resid(uio), ioflag);
2598
2599	if (vnode_vtype(vp) != VREG) {
2600		FSDBG_BOT(515, np, uio_offset(uio), uio_resid(uio), EIO);
2601		return (EIO);
2602	}
2603
2604	thd = vfs_context_thread(ctx);
2605	cred = vfs_context_ucred(ctx);
2606
2607	nfs_data_lock(np, NFS_DATA_LOCK_SHARED);
2608
2609	if ((error = nfs_node_lock(np))) {
2610		nfs_data_unlock(np);
2611		FSDBG_BOT(515, np, uio_offset(uio), uio_resid(uio), error);
2612		return (error);
2613	}
2614	np->n_wrbusy++;
2615
2616	if (np->n_flag & NWRITEERR) {
2617		error = np->n_error;
2618		np->n_flag &= ~NWRITEERR;
2619	}
2620	if (np->n_flag & NNEEDINVALIDATE) {
2621		np->n_flag &= ~NNEEDINVALIDATE;
2622		nfs_node_unlock(np);
2623		nfs_data_unlock(np);
2624		nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1);
2625		nfs_data_lock(np, NFS_DATA_LOCK_SHARED);
2626	} else {
2627		nfs_node_unlock(np);
2628	}
2629	if (error)
2630		goto out;
2631
2632	biosize = nmp->nm_biosize;
2633
2634	if (ioflag & (IO_APPEND | IO_SYNC)) {
2635		nfs_node_lock_force(np);
2636		if (np->n_flag & NMODIFIED) {
2637			NATTRINVALIDATE(np);
2638			nfs_node_unlock(np);
2639			nfs_data_unlock(np);
2640			error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1);
2641			nfs_data_lock(np, NFS_DATA_LOCK_SHARED);
2642			if (error) {
2643				FSDBG(515, np, uio_offset(uio), 0x10bad01, error);
2644				goto out;
2645			}
2646		} else {
2647			nfs_node_unlock(np);
2648		}
2649		if (ioflag & IO_APPEND) {
2650			nfs_data_unlock(np);
2651			/* nfs_getattr() will check changed and purge caches */
2652			error = nfs_getattr(np, NULL, ctx, NGA_UNCACHED);
2653			/* we'll be extending the file, so take the data lock exclusive */
2654			nfs_data_lock(np, NFS_DATA_LOCK_EXCLUSIVE);
2655			if (error) {
2656				FSDBG(515, np, uio_offset(uio), 0x10bad02, error);
2657				goto out;
2658			}
2659			uio_setoffset(uio, np->n_size);
2660		}
2661	}
2662	if (uio_offset(uio) < 0) {
2663		error = EINVAL;
2664		FSDBG_BOT(515, np, uio_offset(uio), 0xbad0ff, error);
2665		goto out;
2666	}
2667	if (uio_resid(uio) == 0)
2668		goto out;
2669
2670	if (((uio_offset(uio) + uio_resid(uio)) > (off_t)np->n_size) && !(ioflag & IO_APPEND)) {
2671		/* it looks like we'll be extending the file, so take the data lock exclusive */
2672		nfs_data_unlock(np);
2673		nfs_data_lock(np, NFS_DATA_LOCK_EXCLUSIVE);
2674	}
2675
2676	do {
2677		OSAddAtomic64(1, &nfsstats.biocache_writes);
2678		lbn = uio_offset(uio) / biosize;
2679		on = uio_offset(uio) % biosize;
2680		n = biosize - on;
2681		if (uio_resid(uio) < n)
2682			n = uio_resid(uio);
2683again:
2684		/*
2685		 * Get a cache block for writing.  The range to be written is
2686		 * (off..off+n) within the block.  We ensure that the block
2687		 * either has no dirty region or that the given range is
2688		 * contiguous with the existing dirty region.
2689		 */
2690		error = nfs_buf_get(np, lbn, biosize, thd, NBLK_WRITE, &bp);
2691		if (error)
2692			goto out;
2693		/* map the block because we know we're going to write to it */
2694		NFS_BUF_MAP(bp);
2695
2696		if (ioflag & IO_NOCACHE)
2697			SET(bp->nb_flags, NB_NOCACHE);
2698
2699		if (!IS_VALID_CRED(bp->nb_wcred)) {
2700			kauth_cred_ref(cred);
2701			bp->nb_wcred = cred;
2702		}
2703
2704		/*
2705		 * If there's already a dirty range AND dirty pages in this block we
2706		 * need to send a commit AND write the dirty pages before continuing.
2707		 *
2708		 * If there's already a dirty range OR dirty pages in this block
2709		 * and the new write range is not contiguous with the existing range,
2710		 * then force the buffer to be written out now.
2711		 * (We used to just extend the dirty range to cover the valid,
2712		 * but unwritten, data in between also.  But writing ranges
2713		 * of data that weren't actually written by an application
2714		 * risks overwriting some other client's data with stale data
2715		 * that's just masquerading as new written data.)
2716		 */
2717		if (bp->nb_dirtyend > 0) {
2718		    if (on > bp->nb_dirtyend || (on + n) < bp->nb_dirtyoff || bp->nb_dirty) {
2719			FSDBG(515, np, uio_offset(uio), bp, 0xd15c001);
2720			/* write/commit buffer "synchronously" */
2721			/* (NB_STABLE indicates that data writes should be FILESYNC) */
2722			CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2723			SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
2724			error = nfs_buf_write(bp);
2725			if (error)
2726			    goto out;
2727			goto again;
2728		    }
2729		} else if (bp->nb_dirty) {
2730		    int firstpg, lastpg;
2731		    u_int32_t pagemask;
2732		    /* calculate write range pagemask */
2733		    firstpg = on/PAGE_SIZE;
2734		    lastpg = (on+n-1)/PAGE_SIZE;
2735		    pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
2736		    /* check if there are dirty pages outside the write range */
2737		    if (bp->nb_dirty & ~pagemask) {
2738			FSDBG(515, np, uio_offset(uio), bp, 0xd15c002);
2739			/* write/commit buffer "synchronously" */
2740			/* (NB_STABLE indicates that data writes should be FILESYNC) */
2741			CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2742			SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
2743			error = nfs_buf_write(bp);
2744			if (error)
2745			    goto out;
2746			goto again;
2747		    }
2748		    /* if the first or last pages are already dirty */
2749		    /* make sure that the dirty range encompasses those pages */
2750		    if (NBPGDIRTY(bp,firstpg) || NBPGDIRTY(bp,lastpg)) {
2751			FSDBG(515, np, uio_offset(uio), bp, 0xd15c003);
2752		    	bp->nb_dirtyoff = min(on, firstpg * PAGE_SIZE);
2753			if (NBPGDIRTY(bp,lastpg)) {
2754			    bp->nb_dirtyend = (lastpg+1) * PAGE_SIZE;
2755			    /* clip to EOF */
2756			    if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size) {
2757				    bp->nb_dirtyend = np->n_size - NBOFF(bp);
2758				    if (bp->nb_dirtyoff >= bp->nb_dirtyend)
2759					    bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2760			    }
2761			} else
2762			    bp->nb_dirtyend = on+n;
2763		    }
2764		}
2765
2766		/*
2767		 * Are we extending the size of the file with this write?
2768		 * If so, update file size now that we have the block.
2769		 * If there was a partial buf at the old eof, validate
2770		 * and zero the new bytes.
2771		 */
2772		if ((uio_offset(uio) + n) > (off_t)np->n_size) {
2773			struct nfsbuf *eofbp = NULL;
2774			daddr64_t eofbn = np->n_size / biosize;
2775			int eofoff = np->n_size % biosize;
2776			int neweofoff = (uio_offset(uio) + n) % biosize;
2777
2778			FSDBG(515, 0xb1ffa000, uio_offset(uio) + n, eofoff, neweofoff);
2779
2780			if (eofoff && (eofbn < lbn) &&
2781			    ((error = nfs_buf_get(np, eofbn, biosize, thd, NBLK_WRITE|NBLK_ONLYVALID, &eofbp))))
2782				goto out;
2783
2784			/* if we're extending within the same last block */
2785			/* and the block is flagged as being cached... */
2786			if ((lbn == eofbn) && ISSET(bp->nb_flags, NB_CACHE)) {
2787				/* ...check that all pages in buffer are valid */
2788				int endpg = ((neweofoff ? neweofoff : biosize) - 1)/PAGE_SIZE;
2789				u_int32_t pagemask;
2790				/* pagemask only has to extend to last page being written to */
2791				pagemask = (1 << (endpg+1)) - 1;
2792				FSDBG(515, 0xb1ffa001, bp->nb_valid, pagemask, 0);
2793				if ((bp->nb_valid & pagemask) != pagemask) {
2794					/* zerofill any hole */
2795					if (on > bp->nb_validend) {
2796						int i;
2797						for (i=bp->nb_validend/PAGE_SIZE; i <= (on - 1)/PAGE_SIZE; i++)
2798							NBPGVALID_SET(bp, i);
2799						NFS_BUF_MAP(bp);
2800						FSDBG(516, bp, bp->nb_validend, on - bp->nb_validend, 0xf01e);
2801						bzero((char *)bp->nb_data + bp->nb_validend,
2802							on - bp->nb_validend);
2803					}
2804					/* zerofill any trailing data in the last page */
2805					if (neweofoff) {
2806						NFS_BUF_MAP(bp);
2807						FSDBG(516, bp, neweofoff, PAGE_SIZE - (neweofoff & PAGE_MASK), 0xe0f);
2808						bzero((char *)bp->nb_data + neweofoff,
2809							PAGE_SIZE - (neweofoff & PAGE_MASK));
2810					}
2811				}
2812			}
2813			np->n_size = uio_offset(uio) + n;
2814			nfs_node_lock_force(np);
2815			CLR(np->n_flag, NUPDATESIZE);
2816			np->n_flag |= NMODIFIED;
2817			nfs_node_unlock(np);
2818			FSDBG(516, np, np->n_size, np->n_vattr.nva_size, 0xf00d0001);
2819			ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
2820			if (eofbp) {
2821				/*
2822				 * We may need to zero any previously invalid data
2823				 * after the old EOF in the previous EOF buffer.
2824				 *
2825				 * For the old last page, don't zero bytes if there
2826				 * are invalid bytes in that page (i.e. the page isn't
2827				 * currently valid).
2828				 * For pages after the old last page, zero them and
2829				 * mark them as valid.
2830				 */
2831				char *d;
2832				int i;
2833				if (ioflag & IO_NOCACHE)
2834					SET(eofbp->nb_flags, NB_NOCACHE);
2835				NFS_BUF_MAP(eofbp);
2836				FSDBG(516, eofbp, eofoff, biosize - eofoff, 0xe0fff01e);
2837				d = eofbp->nb_data;
2838				i = eofoff/PAGE_SIZE;
2839				while (eofoff < biosize) {
2840					int poff = eofoff & PAGE_MASK;
2841					if (!poff || NBPGVALID(eofbp,i)) {
2842						bzero(d + eofoff, PAGE_SIZE - poff);
2843						NBPGVALID_SET(eofbp, i);
2844					}
2845					if (bp->nb_validend == eofoff)
2846						bp->nb_validend += PAGE_SIZE - poff;
2847					eofoff += PAGE_SIZE - poff;
2848					i++;
2849				}
2850				nfs_buf_release(eofbp, 1);
2851			}
2852		}
2853		/*
2854		 * If dirtyend exceeds file size, chop it down.  This should
2855		 * not occur unless there is a race.
2856		 */
2857		if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size) {
2858			bp->nb_dirtyend = np->n_size - NBOFF(bp);
2859			if (bp->nb_dirtyoff >= bp->nb_dirtyend)
2860				bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2861		}
2862		/*
2863		 * UBC doesn't handle partial pages, so we need to make sure
2864		 * that any pages left in the page cache are completely valid.
2865		 *
2866		 * Writes that are smaller than a block are delayed if they
2867		 * don't extend to the end of the block.
2868		 *
2869		 * If the block isn't (completely) cached, we may need to read
2870		 * in some parts of pages that aren't covered by the write.
2871		 * If the write offset (on) isn't page aligned, we'll need to
2872		 * read the start of the first page being written to.  Likewise,
2873		 * if the offset of the end of the write (on+n) isn't page aligned,
2874		 * we'll need to read the end of the last page being written to.
2875		 *
2876		 * Notes:
2877		 * We don't want to read anything we're just going to write over.
2878		 * We don't want to read anything we're just going drop when the
2879		 *   I/O is complete (i.e. don't do reads for NOCACHE requests).
2880		 * We don't want to issue multiple I/Os if we don't have to
2881		 *   (because they're synchronous rpcs).
2882		 * We don't want to read anything we already have modified in the
2883		 *   page cache.
2884		 */
2885		if (!ISSET(bp->nb_flags, NB_CACHE) && (n < biosize)) {
2886			int firstpg, lastpg, dirtypg;
2887			int firstpgoff, lastpgoff;
2888			start = end = -1;
2889			firstpg = on/PAGE_SIZE;
2890			firstpgoff = on & PAGE_MASK;
2891			lastpg = (on+n-1)/PAGE_SIZE;
2892			lastpgoff = (on+n) & PAGE_MASK;
2893			if (firstpgoff && !NBPGVALID(bp,firstpg)) {
2894				/* need to read start of first page */
2895				start = firstpg * PAGE_SIZE;
2896				end = start + firstpgoff;
2897			}
2898			if (lastpgoff && !NBPGVALID(bp,lastpg)) {
2899				/* need to read end of last page */
2900				if (start < 0)
2901					start = (lastpg * PAGE_SIZE) + lastpgoff;
2902				end = (lastpg + 1) * PAGE_SIZE;
2903			}
2904			if (ISSET(bp->nb_flags, NB_NOCACHE)) {
2905				/*
2906				 * For nocache writes, if there is any partial page at the
2907				 * start or end of the write range, then we do the write
2908				 * synchronously to make sure that we can drop the data
2909				 * from the cache as soon as the WRITE finishes.  Normally,
2910				 * we would do an unstable write and not drop the data until
2911				 * it was committed.  But doing that here would risk allowing
2912				 * invalid data to be read from the cache between the WRITE
2913				 * and the COMMIT.
2914				 * (NB_STABLE indicates that data writes should be FILESYNC)
2915				 */
2916				if (end > start)
2917					SET(bp->nb_flags, NB_STABLE);
2918				goto skipread;
2919			}
2920			if (end > start) {
2921				/* need to read the data in range: start...end-1 */
2922
2923				/* first, check for dirty pages in between */
2924				/* if there are, we'll have to do two reads because */
2925				/* we don't want to overwrite the dirty pages. */
2926				for (dirtypg=start/PAGE_SIZE; dirtypg <= (end-1)/PAGE_SIZE; dirtypg++)
2927					if (NBPGDIRTY(bp,dirtypg))
2928						break;
2929
2930				/* if start is at beginning of page, try */
2931				/* to get any preceeding pages as well. */
2932				if (!(start & PAGE_MASK)) {
2933					/* stop at next dirty/valid page or start of block */
2934					for (; start > 0; start-=PAGE_SIZE)
2935						if (NBPGVALID(bp,((start-1)/PAGE_SIZE)))
2936							break;
2937				}
2938
2939				NFS_BUF_MAP(bp);
2940				/* setup uio for read(s) */
2941				boff = NBOFF(bp);
2942				auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ,
2943					&auio_buf, sizeof(auio_buf));
2944
2945				if (dirtypg <= (end-1)/PAGE_SIZE) {
2946					/* there's a dirty page in the way, so just do two reads */
2947					/* we'll read the preceding data here */
2948					uio_reset(auio, boff + start, UIO_SYSSPACE, UIO_READ);
2949					uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + start), on - start);
2950					error = nfs_read_rpc(np, auio, ctx);
2951					if (error) {
2952						/* couldn't read the data, so treat buffer as synchronous NOCACHE */
2953						SET(bp->nb_flags, (NB_NOCACHE|NB_STABLE));
2954						goto skipread;
2955					}
2956					if (uio_resid(auio) > 0) {
2957						FSDBG(516, bp, (caddr_t)uio_curriovbase(auio) - bp->nb_data, uio_resid(auio), 0xd00dee01);
2958						bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
2959					}
2960					if (!error) {
2961						/* update validoff/validend if necessary */
2962						if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
2963							bp->nb_validoff = start;
2964						if ((bp->nb_validend < 0) || (bp->nb_validend < on))
2965							bp->nb_validend = on;
2966						if ((off_t)np->n_size > boff + bp->nb_validend)
2967							bp->nb_validend = min(np->n_size - (boff + start), biosize);
2968						/* validate any pages before the write offset */
2969						for (; start < on/PAGE_SIZE; start+=PAGE_SIZE)
2970							NBPGVALID_SET(bp, start/PAGE_SIZE);
2971					}
2972					/* adjust start to read any trailing data */
2973					start = on+n;
2974				}
2975
2976				/* if end is at end of page, try to */
2977				/* get any following pages as well. */
2978				if (!(end & PAGE_MASK)) {
2979					/* stop at next valid page or end of block */
2980					for (; end < biosize; end+=PAGE_SIZE)
2981						if (NBPGVALID(bp,end/PAGE_SIZE))
2982							break;
2983				}
2984
2985				if (((boff+start) >= (off_t)np->n_size) ||
2986				    ((start >= on) && ((boff + on + n) >= (off_t)np->n_size))) {
2987					/*
2988					 * Either this entire read is beyond the current EOF
2989					 * or the range that we won't be modifying (on+n...end)
2990					 * is all beyond the current EOF.
2991					 * No need to make a trip across the network to
2992					 * read nothing.  So, just zero the buffer instead.
2993					 */
2994					FSDBG(516, bp, start, end - start, 0xd00dee00);
2995					bzero(bp->nb_data + start, end - start);
2996					error = 0;
2997				} else {
2998					/* now we'll read the (rest of the) data */
2999					uio_reset(auio, boff + start, UIO_SYSSPACE, UIO_READ);
3000					uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + start), end - start);
3001					error = nfs_read_rpc(np, auio, ctx);
3002					if (error) {
3003						/* couldn't read the data, so treat buffer as synchronous NOCACHE */
3004						SET(bp->nb_flags, (NB_NOCACHE|NB_STABLE));
3005						goto skipread;
3006					}
3007					if (uio_resid(auio) > 0) {
3008						FSDBG(516, bp, (caddr_t)uio_curriovbase(auio) - bp->nb_data, uio_resid(auio), 0xd00dee02);
3009						bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
3010					}
3011				}
3012				if (!error) {
3013					/* update validoff/validend if necessary */
3014					if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
3015						bp->nb_validoff = start;
3016					if ((bp->nb_validend < 0) || (bp->nb_validend < end))
3017						bp->nb_validend = end;
3018					if ((off_t)np->n_size > boff + bp->nb_validend)
3019						bp->nb_validend = min(np->n_size - (boff + start), biosize);
3020					/* validate any pages before the write offset's page */
3021					for (; start < (off_t)trunc_page_32(on); start+=PAGE_SIZE)
3022						NBPGVALID_SET(bp, start/PAGE_SIZE);
3023					/* validate any pages after the range of pages being written to */
3024					for (; (end - 1) > (off_t)round_page_32(on+n-1); end-=PAGE_SIZE)
3025						NBPGVALID_SET(bp, (end-1)/PAGE_SIZE);
3026				}
3027				/* Note: pages being written to will be validated when written */
3028			}
3029		}
3030skipread:
3031
3032		if (ISSET(bp->nb_flags, NB_ERROR)) {
3033			error = bp->nb_error;
3034			nfs_buf_release(bp, 1);
3035			goto out;
3036		}
3037
3038		nfs_node_lock_force(np);
3039		np->n_flag |= NMODIFIED;
3040		nfs_node_unlock(np);
3041
3042		NFS_BUF_MAP(bp);
3043		error = uiomove((char *)bp->nb_data + on, n, uio);
3044		if (error) {
3045			SET(bp->nb_flags, NB_ERROR);
3046			nfs_buf_release(bp, 1);
3047			goto out;
3048		}
3049
3050		/* validate any pages written to */
3051		start = on & ~PAGE_MASK;
3052		for (; start < on+n; start += PAGE_SIZE) {
3053			NBPGVALID_SET(bp, start/PAGE_SIZE);
3054			/*
3055			 * This may seem a little weird, but we don't actually set the
3056			 * dirty bits for writes.  This is because we keep the dirty range
3057			 * in the nb_dirtyoff/nb_dirtyend fields.  Also, particularly for
3058			 * delayed writes, when we give the pages back to the VM we don't
3059			 * want to keep them marked dirty, because when we later write the
3060			 * buffer we won't be able to tell which pages were written dirty
3061			 * and which pages were mmapped and dirtied.
3062			 */
3063		}
3064		if (bp->nb_dirtyend > 0) {
3065			bp->nb_dirtyoff = min(on, bp->nb_dirtyoff);
3066			bp->nb_dirtyend = max((on + n), bp->nb_dirtyend);
3067		} else {
3068			bp->nb_dirtyoff = on;
3069			bp->nb_dirtyend = on + n;
3070		}
3071		if (bp->nb_validend <= 0 || bp->nb_validend < bp->nb_dirtyoff ||
3072		    bp->nb_validoff > bp->nb_dirtyend) {
3073			bp->nb_validoff = bp->nb_dirtyoff;
3074			bp->nb_validend = bp->nb_dirtyend;
3075		} else {
3076			bp->nb_validoff = min(bp->nb_validoff, bp->nb_dirtyoff);
3077			bp->nb_validend = max(bp->nb_validend, bp->nb_dirtyend);
3078		}
3079		if (!ISSET(bp->nb_flags, NB_CACHE))
3080			nfs_buf_normalize_valid_range(np, bp);
3081
3082		/*
3083		 * Since this block is being modified, it must be written
3084		 * again and not just committed.
3085		 */
3086		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3087			nfs_node_lock_force(np);
3088			if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3089				np->n_needcommitcnt--;
3090				CHECK_NEEDCOMMITCNT(np);
3091			}
3092			CLR(bp->nb_flags, NB_NEEDCOMMIT);
3093			nfs_node_unlock(np);
3094		}
3095
3096		if (ioflag & IO_SYNC) {
3097			error = nfs_buf_write(bp);
3098			if (error)
3099				goto out;
3100		} else if (((n + on) == biosize) || (ioflag & IO_APPEND) ||
3101			   (ioflag & IO_NOCACHE) || ISSET(bp->nb_flags, NB_NOCACHE)) {
3102			SET(bp->nb_flags, NB_ASYNC);
3103			error = nfs_buf_write(bp);
3104			if (error)
3105				goto out;
3106		} else {
3107			/* If the block wasn't already delayed: charge for the write */
3108			if (!ISSET(bp->nb_flags, NB_DELWRI)) {
3109				proc_t p = vfs_context_proc(ctx);
3110				if (p && p->p_stats)
3111					OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);
3112			}
3113			nfs_buf_write_delayed(bp);
3114		}
3115		if (np->n_needcommitcnt >= NFS_A_LOT_OF_NEEDCOMMITS)
3116		        nfs_flushcommits(np, 1);
3117
3118	} while (uio_resid(uio) > 0 && n > 0);
3119
3120out:
3121	nfs_node_lock_force(np);
3122	np->n_wrbusy--;
3123	nfs_node_unlock(np);
3124	nfs_data_unlock(np);
3125	FSDBG_BOT(515, np, uio_offset(uio), uio_resid(uio), error);
3126	return (error);
3127}
3128
3129
3130/*
3131 * NFS write call
3132 */
3133int
3134nfs_write_rpc(
3135	nfsnode_t np,
3136	uio_t uio,
3137	vfs_context_t ctx,
3138	int *iomodep,
3139	uint64_t *wverfp)
3140{
3141	return nfs_write_rpc2(np, uio, vfs_context_thread(ctx), vfs_context_ucred(ctx), iomodep, wverfp);
3142}
3143
3144int
3145nfs_write_rpc2(
3146	nfsnode_t np,
3147	uio_t uio,
3148	thread_t thd,
3149	kauth_cred_t cred,
3150	int *iomodep,
3151	uint64_t *wverfp)
3152{
3153	struct nfsmount *nmp;
3154	int error = 0, nfsvers;
3155	int wverfset, commit, committed;
3156	uint64_t wverf = 0, wverf2;
3157	size_t nmwsize, totalsize, tsiz, len, rlen;
3158	struct nfsreq rq, *req = &rq;
3159	uint32_t stategenid = 0, vrestart = 0, restart = 0;
3160	uio_t uio_save = NULL;
3161
3162#if DIAGNOSTIC
3163	/* XXX limitation based on need to back up uio on short write */
3164	if (uio_iovcnt(uio) != 1)
3165		panic("nfs3_write_rpc: iovcnt > 1");
3166#endif
3167	FSDBG_TOP(537, np, uio_offset(uio), uio_resid(uio), *iomodep);
3168	nmp = NFSTONMP(np);
3169	if (!nmp)
3170		return (ENXIO);
3171	nfsvers = nmp->nm_vers;
3172	nmwsize = nmp->nm_wsize;
3173
3174	wverfset = 0;
3175	committed = NFS_WRITE_FILESYNC;
3176
3177	totalsize = tsiz = uio_resid(uio);
3178	if ((nfsvers == NFS_VER2) && ((uint64_t)(uio_offset(uio) + tsiz) > 0xffffffffULL)) {
3179		FSDBG_BOT(537, np, uio_offset(uio), uio_resid(uio), EFBIG);
3180		return (EFBIG);
3181	}
3182
3183	uio_save = uio_duplicate(uio);
3184	if (uio_save == NULL) {
3185		return (EIO);
3186	}
3187
3188	while (tsiz > 0) {
3189		len = (tsiz > nmwsize) ? nmwsize : tsiz;
3190		FSDBG(537, np, uio_offset(uio), len, 0);
3191		if (np->n_flag & NREVOKE) {
3192			error = EIO;
3193			break;
3194		}
3195		if (nmp->nm_vers >= NFS_VER4)
3196			stategenid = nmp->nm_stategenid;
3197		error = nmp->nm_funcs->nf_write_rpc_async(np, uio, len, thd, cred, *iomodep, NULL, &req);
3198		if (!error)
3199			error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &commit, &rlen, &wverf2);
3200		nmp = NFSTONMP(np);
3201		if (!nmp)
3202			error = ENXIO;
3203		if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) &&
3204		    (++restart <= nfs_mount_state_max_restarts(nmp))) { /* guard against no progress */
3205			lck_mtx_lock(&nmp->nm_lock);
3206			if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid)) {
3207				NP(np, "nfs_write_rpc: error %d, initiating recovery", error);
3208				nfs_need_recover(nmp, error);
3209			}
3210			lck_mtx_unlock(&nmp->nm_lock);
3211			if (np->n_flag & NREVOKE) {
3212				error = EIO;
3213			} else {
3214				if (error == NFSERR_GRACE)
3215					tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz);
3216				if (!(error = nfs_mount_state_wait_for_recovery(nmp)))
3217					continue;
3218			}
3219		}
3220		if (error)
3221			break;
3222		if (nfsvers == NFS_VER2) {
3223			tsiz -= len;
3224			continue;
3225		}
3226
3227		/* check for a short write */
3228		if (rlen < len) {
3229			/* Reset the uio to reflect the actual transfer */
3230			*uio = *uio_save;
3231			uio_update(uio, totalsize - (tsiz - rlen));
3232			len = rlen;
3233		}
3234
3235		/* return lowest commit level returned */
3236		if (commit < committed)
3237			committed = commit;
3238
3239		tsiz -= len;
3240
3241		/* check write verifier */
3242		if (!wverfset) {
3243			wverf = wverf2;
3244			wverfset = 1;
3245		} else if (wverf != wverf2) {
3246			/* verifier changed, so we need to restart all the writes */
3247			if (++vrestart > 100) {
3248				/* give up after too many restarts */
3249				error = EIO;
3250				break;
3251			}
3252			*uio = *uio_save;	// Reset the uio back to the start
3253			committed = NFS_WRITE_FILESYNC;
3254			wverfset = 0;
3255			tsiz = totalsize;
3256		}
3257	}
3258	if (uio_save)
3259		uio_free(uio_save);
3260	if (wverfset && wverfp)
3261		*wverfp = wverf;
3262	*iomodep = committed;
3263	if (error)
3264		uio_setresid(uio, tsiz);
3265	FSDBG_BOT(537, np, committed, uio_resid(uio), error);
3266	return (error);
3267}
3268
3269int
3270nfs3_write_rpc_async(
3271	nfsnode_t np,
3272	uio_t uio,
3273	size_t len,
3274	thread_t thd,
3275	kauth_cred_t cred,
3276	int iomode,
3277	struct nfsreq_cbinfo *cb,
3278	struct nfsreq **reqp)
3279{
3280	struct nfsmount *nmp;
3281	mount_t mp;
3282	int error = 0, nfsvers;
3283	struct nfsm_chain nmreq;
3284
3285	nmp = NFSTONMP(np);
3286	if (!nmp)
3287		return (ENXIO);
3288	nfsvers = nmp->nm_vers;
3289
3290	/* for async mounts, don't bother sending sync write requests */
3291	if ((iomode != NFS_WRITE_UNSTABLE) && nfs_allow_async &&
3292	    ((mp = NFSTOMP(np))) && (vfs_flags(mp) & MNT_ASYNC))
3293		iomode = NFS_WRITE_UNSTABLE;
3294
3295	nfsm_chain_null(&nmreq);
3296	nfsm_chain_build_alloc_init(error, &nmreq,
3297		NFSX_FH(nfsvers) + 5 * NFSX_UNSIGNED + nfsm_rndup(len));
3298	nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize);
3299	if (nfsvers == NFS_VER3) {
3300		nfsm_chain_add_64(error, &nmreq, uio_offset(uio));
3301		nfsm_chain_add_32(error, &nmreq, len);
3302		nfsm_chain_add_32(error, &nmreq, iomode);
3303	} else {
3304		nfsm_chain_add_32(error, &nmreq, 0);
3305		nfsm_chain_add_32(error, &nmreq, uio_offset(uio));
3306		nfsm_chain_add_32(error, &nmreq, 0);
3307	}
3308	nfsm_chain_add_32(error, &nmreq, len);
3309	nfsmout_if(error);
3310	error = nfsm_chain_add_uio(&nmreq, uio, len);
3311	nfsm_chain_build_done(error, &nmreq);
3312	nfsmout_if(error);
3313	error = nfs_request_async(np, NULL, &nmreq, NFSPROC_WRITE, thd, cred, NULL, 0, cb, reqp);
3314nfsmout:
3315	nfsm_chain_cleanup(&nmreq);
3316	return (error);
3317}
3318
3319int
3320nfs3_write_rpc_async_finish(
3321	nfsnode_t np,
3322	struct nfsreq *req,
3323	int *iomodep,
3324	size_t *rlenp,
3325	uint64_t *wverfp)
3326{
3327	struct nfsmount *nmp;
3328	int error = 0, lockerror = ENOENT, nfsvers, status;
3329	int updatemtime = 0, wccpostattr = 0, rlen, committed = NFS_WRITE_FILESYNC;
3330	u_int64_t xid, wverf;
3331	mount_t mp;
3332	struct nfsm_chain nmrep;
3333
3334	nmp = NFSTONMP(np);
3335	if (!nmp) {
3336		nfs_request_async_cancel(req);
3337		return (ENXIO);
3338	}
3339	nfsvers = nmp->nm_vers;
3340
3341	nfsm_chain_null(&nmrep);
3342
3343	error = nfs_request_async_finish(req, &nmrep, &xid, &status);
3344	if (error == EINPROGRESS) /* async request restarted */
3345		return (error);
3346	nmp = NFSTONMP(np);
3347	if (!nmp)
3348		error = ENXIO;
3349	if (!error && (lockerror = nfs_node_lock(np)))
3350		error = lockerror;
3351	if (nfsvers == NFS_VER3) {
3352		struct timespec premtime = { 0, 0 };
3353		nfsm_chain_get_wcc_data(error, &nmrep, np, &premtime, &wccpostattr, &xid);
3354		if (nfstimespeccmp(&np->n_mtime, &premtime, ==))
3355			updatemtime = 1;
3356		if (!error)
3357			error = status;
3358		nfsm_chain_get_32(error, &nmrep, rlen);
3359		nfsmout_if(error);
3360		*rlenp = rlen;
3361		if (rlen <= 0)
3362			error = NFSERR_IO;
3363		nfsm_chain_get_32(error, &nmrep, committed);
3364		nfsm_chain_get_64(error, &nmrep, wverf);
3365		nfsmout_if(error);
3366		if (wverfp)
3367			*wverfp = wverf;
3368		lck_mtx_lock(&nmp->nm_lock);
3369		if (!(nmp->nm_state & NFSSTA_HASWRITEVERF)) {
3370			nmp->nm_verf = wverf;
3371			nmp->nm_state |= NFSSTA_HASWRITEVERF;
3372		} else if (nmp->nm_verf != wverf) {
3373			nmp->nm_verf = wverf;
3374		}
3375		lck_mtx_unlock(&nmp->nm_lock);
3376	} else {
3377		if (!error)
3378			error = status;
3379		nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid);
3380		nfsmout_if(error);
3381	}
3382	if (updatemtime)
3383		NFS_CHANGED_UPDATE(nfsvers, np, &np->n_vattr);
3384nfsmout:
3385	if (!lockerror)
3386		nfs_node_unlock(np);
3387	nfsm_chain_cleanup(&nmrep);
3388	if ((committed != NFS_WRITE_FILESYNC) && nfs_allow_async &&
3389	    ((mp = NFSTOMP(np))) && (vfs_flags(mp) & MNT_ASYNC))
3390		committed = NFS_WRITE_FILESYNC;
3391	*iomodep = committed;
3392	return (error);
3393}
3394
3395/*
3396 * NFS mknod vnode op
3397 *
3398 * For NFS v2 this is a kludge. Use a create RPC but with the IFMT bits of the
3399 * mode set to specify the file type and the size field for rdev.
3400 */
3401int
3402nfs3_vnop_mknod(
3403	struct vnop_mknod_args /* {
3404		struct vnodeop_desc *a_desc;
3405		vnode_t a_dvp;
3406		vnode_t *a_vpp;
3407		struct componentname *a_cnp;
3408		struct vnode_attr *a_vap;
3409		vfs_context_t a_context;
3410	} */ *ap)
3411{
3412	vnode_t dvp = ap->a_dvp;
3413	vnode_t *vpp = ap->a_vpp;
3414	struct componentname *cnp = ap->a_cnp;
3415	struct vnode_attr *vap = ap->a_vap;
3416	vfs_context_t ctx = ap->a_context;
3417	vnode_t newvp = NULL;
3418	nfsnode_t np = NULL;
3419	struct nfsmount *nmp;
3420	nfsnode_t dnp = VTONFS(dvp);
3421	struct nfs_vattr nvattr;
3422	fhandle_t fh;
3423	int error = 0, lockerror = ENOENT, busyerror = ENOENT, status, wccpostattr = 0;
3424	struct timespec premtime = { 0, 0 };
3425	u_int32_t rdev;
3426	u_int64_t xid = 0, dxid;
3427	int nfsvers, gotuid, gotgid;
3428	struct nfsm_chain nmreq, nmrep;
3429	struct nfsreq rq, *req = &rq;
3430
3431	nmp = VTONMP(dvp);
3432	if (!nmp)
3433		return (ENXIO);
3434	nfsvers = nmp->nm_vers;
3435
3436	if (!VATTR_IS_ACTIVE(vap, va_type))
3437		return (EINVAL);
3438	if (vap->va_type == VCHR || vap->va_type == VBLK) {
3439		if (!VATTR_IS_ACTIVE(vap, va_rdev))
3440			return (EINVAL);
3441		rdev = vap->va_rdev;
3442	} else if (vap->va_type == VFIFO || vap->va_type == VSOCK)
3443		rdev = 0xffffffff;
3444	else {
3445		return (ENOTSUP);
3446	}
3447	if ((nfsvers == NFS_VER2) && (cnp->cn_namelen > NFS_MAXNAMLEN))
3448		return (ENAMETOOLONG);
3449
3450	nfs_avoid_needless_id_setting_on_create(dnp, vap, ctx);
3451
3452	VATTR_SET_SUPPORTED(vap, va_mode);
3453	VATTR_SET_SUPPORTED(vap, va_uid);
3454	VATTR_SET_SUPPORTED(vap, va_gid);
3455	VATTR_SET_SUPPORTED(vap, va_data_size);
3456	VATTR_SET_SUPPORTED(vap, va_access_time);
3457	VATTR_SET_SUPPORTED(vap, va_modify_time);
3458	gotuid = VATTR_IS_ACTIVE(vap, va_uid);
3459	gotgid = VATTR_IS_ACTIVE(vap, va_gid);
3460
3461	nfsm_chain_null(&nmreq);
3462	nfsm_chain_null(&nmrep);
3463
3464	nfsm_chain_build_alloc_init(error, &nmreq,
3465		NFSX_FH(nfsvers) + 4 * NFSX_UNSIGNED +
3466		nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(nfsvers));
3467	nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize);
3468	nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp);
3469	if (nfsvers == NFS_VER3) {
3470		nfsm_chain_add_32(error, &nmreq, vtonfs_type(vap->va_type, nfsvers));
3471		nfsm_chain_add_v3sattr(error, &nmreq, vap);
3472		if (vap->va_type == VCHR || vap->va_type == VBLK) {
3473			nfsm_chain_add_32(error, &nmreq, major(vap->va_rdev));
3474			nfsm_chain_add_32(error, &nmreq, minor(vap->va_rdev));
3475		}
3476	} else {
3477		nfsm_chain_add_v2sattr(error, &nmreq, vap, rdev);
3478	}
3479	nfsm_chain_build_done(error, &nmreq);
3480	if (!error)
3481		error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx));
3482	nfsmout_if(error);
3483
3484	error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_MKNOD,
3485			vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req);
3486	if (!error)
3487		error = nfs_request_async_finish(req, &nmrep, &xid, &status);
3488
3489	if ((lockerror = nfs_node_lock(dnp)))
3490		error = lockerror;
3491	/* XXX no EEXIST kludge here? */
3492	dxid = xid;
3493	if (!error && !status) {
3494		if (dnp->n_flag & NNEGNCENTRIES) {
3495			dnp->n_flag &= ~NNEGNCENTRIES;
3496			cache_purge_negatives(dvp);
3497		}
3498		error = nfsm_chain_get_fh_attr(&nmrep, dnp, ctx, nfsvers, &xid, &fh, &nvattr);
3499	}
3500	if (nfsvers == NFS_VER3)
3501		nfsm_chain_get_wcc_data(error, &nmrep, dnp, &premtime, &wccpostattr, &dxid);
3502	if (!error)
3503		error = status;
3504nfsmout:
3505	nfsm_chain_cleanup(&nmreq);
3506	nfsm_chain_cleanup(&nmrep);
3507
3508	if (!lockerror) {
3509		dnp->n_flag |= NMODIFIED;
3510		/* if directory hadn't changed, update namecache mtime */
3511		if (nfstimespeccmp(&dnp->n_ncmtime, &premtime, ==))
3512			NFS_CHANGED_UPDATE_NC(nfsvers, dnp, &dnp->n_vattr);
3513		nfs_node_unlock(dnp);
3514		/* nfs_getattr() will check changed and purge caches */
3515		nfs_getattr(dnp, NULL, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED);
3516	}
3517
3518	if (!error && fh.fh_len)
3519		error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &np);
3520	if (!error && !np)
3521		error = nfs_lookitup(dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx, &np);
3522	if (!error && np)
3523		newvp = NFSTOV(np);
3524	if (!busyerror)
3525		nfs_node_clear_busy(dnp);
3526
3527	if (!error && (gotuid || gotgid) &&
3528	    (!newvp || nfs_getattrcache(np, &nvattr, 0) ||
3529	     (gotuid && (nvattr.nva_uid != vap->va_uid)) ||
3530	     (gotgid && (nvattr.nva_gid != vap->va_gid)))) {
3531		/* clear ID bits if server didn't use them (or we can't tell) */
3532		VATTR_CLEAR_SUPPORTED(vap, va_uid);
3533		VATTR_CLEAR_SUPPORTED(vap, va_gid);
3534	}
3535	if (error) {
3536		if (newvp) {
3537			nfs_node_unlock(np);
3538			vnode_put(newvp);
3539		}
3540	} else {
3541		*vpp = newvp;
3542		nfs_node_unlock(np);
3543	}
3544	return (error);
3545}
3546
3547static uint32_t create_verf;
3548/*
3549 * NFS file create call
3550 */
3551int
3552nfs3_vnop_create(
3553	struct vnop_create_args /* {
3554		struct vnodeop_desc *a_desc;
3555		vnode_t a_dvp;
3556		vnode_t *a_vpp;
3557		struct componentname *a_cnp;
3558		struct vnode_attr *a_vap;
3559		vfs_context_t a_context;
3560	} */ *ap)
3561{
3562	vfs_context_t ctx = ap->a_context;
3563	vnode_t dvp = ap->a_dvp;
3564	struct vnode_attr *vap = ap->a_vap;
3565	struct componentname *cnp = ap->a_cnp;
3566	struct nfs_vattr nvattr;
3567	fhandle_t fh;
3568	nfsnode_t np = NULL;
3569	struct nfsmount *nmp;
3570	nfsnode_t dnp = VTONFS(dvp);
3571	vnode_t newvp = NULL;
3572	int error = 0, lockerror = ENOENT, busyerror = ENOENT, status, wccpostattr = 0, fmode = 0;
3573	struct timespec premtime = { 0, 0 };
3574	int nfsvers, gotuid, gotgid;
3575	u_int64_t xid, dxid;
3576	uint32_t val;
3577	struct nfsm_chain nmreq, nmrep;
3578	struct nfsreq rq, *req = &rq;
3579	struct nfs_dulookup dul;
3580
3581	nmp = VTONMP(dvp);
3582	if (!nmp)
3583		return (ENXIO);
3584	nfsvers = nmp->nm_vers;
3585
3586	if ((nfsvers == NFS_VER2) && (cnp->cn_namelen > NFS_MAXNAMLEN))
3587		return (ENAMETOOLONG);
3588
3589	nfs_avoid_needless_id_setting_on_create(dnp, vap, ctx);
3590
3591	VATTR_SET_SUPPORTED(vap, va_mode);
3592	VATTR_SET_SUPPORTED(vap, va_uid);
3593	VATTR_SET_SUPPORTED(vap, va_gid);
3594	VATTR_SET_SUPPORTED(vap, va_data_size);
3595	VATTR_SET_SUPPORTED(vap, va_access_time);
3596	VATTR_SET_SUPPORTED(vap, va_modify_time);
3597	gotuid = VATTR_IS_ACTIVE(vap, va_uid);
3598	gotgid = VATTR_IS_ACTIVE(vap, va_gid);
3599
3600	if (vap->va_vaflags & VA_EXCLUSIVE) {
3601		fmode |= O_EXCL;
3602		if (!VATTR_IS_ACTIVE(vap, va_access_time) || !VATTR_IS_ACTIVE(vap, va_modify_time))
3603			vap->va_vaflags |= VA_UTIMES_NULL;
3604	}
3605
3606again:
3607	error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx));
3608	nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx);
3609
3610	nfsm_chain_null(&nmreq);
3611	nfsm_chain_null(&nmrep);
3612
3613	nfsm_chain_build_alloc_init(error, &nmreq,
3614		NFSX_FH(nfsvers) + 2 * NFSX_UNSIGNED +
3615		nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(nfsvers));
3616	nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize);
3617	nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp);
3618	if (nfsvers == NFS_VER3) {
3619		if (fmode & O_EXCL) {
3620			nfsm_chain_add_32(error, &nmreq, NFS_CREATE_EXCLUSIVE);
3621			lck_rw_lock_shared(in_ifaddr_rwlock);
3622			if (!TAILQ_EMPTY(&in_ifaddrhead))
3623				val = IA_SIN(in_ifaddrhead.tqh_first)->sin_addr.s_addr;
3624			else
3625				val = create_verf;
3626			lck_rw_done(in_ifaddr_rwlock);
3627			nfsm_chain_add_32(error, &nmreq, val);
3628			++create_verf;
3629			nfsm_chain_add_32(error, &nmreq, create_verf);
3630		} else {
3631			nfsm_chain_add_32(error, &nmreq, NFS_CREATE_UNCHECKED);
3632			nfsm_chain_add_v3sattr(error, &nmreq, vap);
3633		}
3634	} else {
3635		nfsm_chain_add_v2sattr(error, &nmreq, vap, 0);
3636	}
3637	nfsm_chain_build_done(error, &nmreq);
3638	nfsmout_if(error);
3639
3640	error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_CREATE,
3641			vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req);
3642	if (!error) {
3643		nfs_dulookup_start(&dul, dnp, ctx);
3644		error = nfs_request_async_finish(req, &nmrep, &xid, &status);
3645	}
3646
3647	if ((lockerror = nfs_node_lock(dnp)))
3648		error = lockerror;
3649	dxid = xid;
3650	if (!error && !status) {
3651		if (dnp->n_flag & NNEGNCENTRIES) {
3652			dnp->n_flag &= ~NNEGNCENTRIES;
3653			cache_purge_negatives(dvp);
3654		}
3655		error = nfsm_chain_get_fh_attr(&nmrep, dnp, ctx, nfsvers, &xid, &fh, &nvattr);
3656	}
3657	if (nfsvers == NFS_VER3)
3658		nfsm_chain_get_wcc_data(error, &nmrep, dnp, &premtime, &wccpostattr, &dxid);
3659	if (!error)
3660		error = status;
3661nfsmout:
3662	nfsm_chain_cleanup(&nmreq);
3663	nfsm_chain_cleanup(&nmrep);
3664
3665	if (!lockerror) {
3666		dnp->n_flag |= NMODIFIED;
3667		/* if directory hadn't changed, update namecache mtime */
3668		if (nfstimespeccmp(&dnp->n_ncmtime, &premtime, ==))
3669			NFS_CHANGED_UPDATE_NC(nfsvers, dnp, &dnp->n_vattr);
3670		nfs_node_unlock(dnp);
3671		/* nfs_getattr() will check changed and purge caches */
3672		nfs_getattr(dnp, NULL, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED);
3673	}
3674
3675	if (!error && fh.fh_len)
3676		error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &np);
3677	if (!error && !np)
3678		error = nfs_lookitup(dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx, &np);
3679	if (!error && np)
3680		newvp = NFSTOV(np);
3681
3682	nfs_dulookup_finish(&dul, dnp, ctx);
3683	if (!busyerror)
3684		nfs_node_clear_busy(dnp);
3685
3686	if (error) {
3687		if ((nfsvers == NFS_VER3) && (fmode & O_EXCL) && (error == NFSERR_NOTSUPP)) {
3688			fmode &= ~O_EXCL;
3689			goto again;
3690		}
3691		if (newvp) {
3692			nfs_node_unlock(np);
3693			vnode_put(newvp);
3694		}
3695	} else if ((nfsvers == NFS_VER3) && (fmode & O_EXCL)) {
3696		nfs_node_unlock(np);
3697		error = nfs3_setattr_rpc(np, vap, ctx);
3698		if (error && (gotuid || gotgid)) {
3699			/* it's possible the server didn't like our attempt to set IDs. */
3700			/* so, let's try it again without those */
3701			VATTR_CLEAR_ACTIVE(vap, va_uid);
3702			VATTR_CLEAR_ACTIVE(vap, va_gid);
3703			error = nfs3_setattr_rpc(np, vap, ctx);
3704		}
3705		if (error)
3706			vnode_put(newvp);
3707		else
3708			nfs_node_lock_force(np);
3709	}
3710	if (!error)
3711		*ap->a_vpp = newvp;
3712	if (!error && (gotuid || gotgid) &&
3713	    (!newvp || nfs_getattrcache(np, &nvattr, 0) ||
3714	     (gotuid && (nvattr.nva_uid != vap->va_uid)) ||
3715	     (gotgid && (nvattr.nva_gid != vap->va_gid)))) {
3716		/* clear ID bits if server didn't use them (or we can't tell) */
3717		VATTR_CLEAR_SUPPORTED(vap, va_uid);
3718		VATTR_CLEAR_SUPPORTED(vap, va_gid);
3719	}
3720	if (!error)
3721		nfs_node_unlock(np);
3722	return (error);
3723}
3724
3725/*
3726 * NFS file remove call
3727 * To try and make NFS semantics closer to UFS semantics, a file that has
3728 * other processes using the vnode is renamed instead of removed and then
3729 * removed later on the last close.
3730 * - If vnode_isinuse()
3731 *	  If a rename is not already in the works
3732 *	     call nfs_sillyrename() to set it up
3733 *     else
3734 *	  do the remove RPC
3735 */
3736int
3737nfs_vnop_remove(
3738	struct vnop_remove_args /* {
3739		struct vnodeop_desc *a_desc;
3740		vnode_t a_dvp;
3741		vnode_t a_vp;
3742		struct componentname *a_cnp;
3743		int a_flags;
3744		vfs_context_t a_context;
3745	} */ *ap)
3746{
3747	vfs_context_t ctx = ap->a_context;
3748	vnode_t vp = ap->a_vp;
3749	vnode_t dvp = ap->a_dvp;
3750	struct componentname *cnp = ap->a_cnp;
3751	nfsnode_t dnp = VTONFS(dvp);
3752	nfsnode_t np = VTONFS(vp);
3753	int error = 0, nfsvers, namedattrs, inuse, gotattr = 0, flushed = 0, setsize = 0;
3754	struct nfs_vattr nvattr;
3755	struct nfsmount *nmp;
3756	struct nfs_dulookup dul;
3757
3758	/* XXX prevent removing a sillyrenamed file? */
3759
3760	nmp = NFSTONMP(dnp);
3761	if (!nmp)
3762		return (ENXIO);
3763	nfsvers = nmp->nm_vers;
3764	namedattrs = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR);
3765
3766again_relock:
3767	error = nfs_node_set_busy2(dnp, np, vfs_context_thread(ctx));
3768	if (error)
3769		return (error);
3770
3771	/* lock the node while we remove the file */
3772	lck_mtx_lock(nfs_node_hash_mutex);
3773	while (np->n_hflag & NHLOCKED) {
3774		np->n_hflag |= NHLOCKWANT;
3775		msleep(np, nfs_node_hash_mutex, PINOD, "nfs_remove", NULL);
3776	}
3777	np->n_hflag |= NHLOCKED;
3778	lck_mtx_unlock(nfs_node_hash_mutex);
3779
3780	if (!namedattrs)
3781		nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx);
3782again:
3783	inuse = vnode_isinuse(vp, 0);
3784	if ((ap->a_flags & VNODE_REMOVE_NODELETEBUSY) && inuse) {
3785		/* Caller requested Carbon delete semantics, but file is busy */
3786		error = EBUSY;
3787		goto out;
3788	}
3789	if (inuse && !gotattr) {
3790		if (nfs_getattr(np, &nvattr, ctx, NGA_CACHED))
3791			nvattr.nva_nlink = 1;
3792		gotattr = 1;
3793		goto again;
3794	}
3795	if (!inuse || (np->n_sillyrename && (nvattr.nva_nlink > 1))) {
3796
3797		if (!inuse && !flushed) { /* flush all the buffers first */
3798			/* unlock the node */
3799			lck_mtx_lock(nfs_node_hash_mutex);
3800			np->n_hflag &= ~NHLOCKED;
3801			if (np->n_hflag & NHLOCKWANT) {
3802				np->n_hflag &= ~NHLOCKWANT;
3803				wakeup(np);
3804			}
3805			lck_mtx_unlock(nfs_node_hash_mutex);
3806			nfs_node_clear_busy2(dnp, np);
3807			error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1);
3808			FSDBG(260, np, np->n_size, np->n_vattr.nva_size, 0xf00d0011);
3809			flushed = 1;
3810			if (error == EINTR) {
3811				nfs_node_lock_force(np);
3812				NATTRINVALIDATE(np);
3813				nfs_node_unlock(np);
3814				return (error);
3815			}
3816			if (!namedattrs)
3817				nfs_dulookup_finish(&dul, dnp, ctx);
3818			goto again_relock;
3819		}
3820
3821		if ((nmp->nm_vers >= NFS_VER4) && (np->n_openflags & N_DELEG_MASK))
3822			nfs4_delegation_return(np, 0, vfs_context_thread(ctx), vfs_context_ucred(ctx));
3823
3824		/*
3825		 * Purge the name cache so that the chance of a lookup for
3826		 * the name succeeding while the remove is in progress is
3827		 * minimized.
3828		 */
3829		nfs_name_cache_purge(dnp, np, cnp, ctx);
3830
3831		if (!namedattrs)
3832			nfs_dulookup_start(&dul, dnp, ctx);
3833
3834		/* Do the rpc */
3835		error = nmp->nm_funcs->nf_remove_rpc(dnp, cnp->cn_nameptr, cnp->cn_namelen,
3836				vfs_context_thread(ctx), vfs_context_ucred(ctx));
3837
3838		/*
3839		 * Kludge City: If the first reply to the remove rpc is lost..
3840		 *   the reply to the retransmitted request will be ENOENT
3841		 *   since the file was in fact removed
3842		 *   Therefore, we cheat and return success.
3843		 */
3844		if (error == ENOENT)
3845			error = 0;
3846
3847		if (!error && !inuse && !np->n_sillyrename) {
3848			/*
3849			 * removal succeeded, it's not in use, and not silly renamed so
3850			 * remove nfsnode from hash now so we can't accidentally find it
3851			 * again if another object gets created with the same filehandle
3852			 * before this vnode gets reclaimed
3853			 */
3854			lck_mtx_lock(nfs_node_hash_mutex);
3855			if (np->n_hflag & NHHASHED) {
3856				LIST_REMOVE(np, n_hash);
3857				np->n_hflag &= ~NHHASHED;
3858				FSDBG(266, 0, np, np->n_flag, 0xb1eb1e);
3859			}
3860			lck_mtx_unlock(nfs_node_hash_mutex);
3861			/* clear flags now: won't get nfs_vnop_inactive for recycled vnode */
3862			/* clear all flags other than these */
3863			nfs_node_lock_force(np);
3864			np->n_flag &= (NMODIFIED);
3865			NATTRINVALIDATE(np);
3866			nfs_node_unlock(np);
3867			vnode_recycle(vp);
3868			setsize = 1;
3869		} else {
3870			nfs_node_lock_force(np);
3871			NATTRINVALIDATE(np);
3872			nfs_node_unlock(np);
3873		}
3874	} else if (!np->n_sillyrename) {
3875		if (!namedattrs)
3876			nfs_dulookup_start(&dul, dnp, ctx);
3877		error = nfs_sillyrename(dnp, np, cnp, ctx);
3878		nfs_node_lock_force(np);
3879		NATTRINVALIDATE(np);
3880		nfs_node_unlock(np);
3881	} else {
3882		nfs_node_lock_force(np);
3883		NATTRINVALIDATE(np);
3884		nfs_node_unlock(np);
3885		if (!namedattrs)
3886			nfs_dulookup_start(&dul, dnp, ctx);
3887	}
3888
3889	/* nfs_getattr() will check changed and purge caches */
3890	nfs_getattr(dnp, NULL, ctx, NGA_CACHED);
3891	if (!namedattrs)
3892		nfs_dulookup_finish(&dul, dnp, ctx);
3893out:
3894	/* unlock the node */
3895	lck_mtx_lock(nfs_node_hash_mutex);
3896	np->n_hflag &= ~NHLOCKED;
3897	if (np->n_hflag & NHLOCKWANT) {
3898		np->n_hflag &= ~NHLOCKWANT;
3899		wakeup(np);
3900	}
3901	lck_mtx_unlock(nfs_node_hash_mutex);
3902	nfs_node_clear_busy2(dnp, np);
3903	if (setsize)
3904		ubc_setsize(vp, 0);
3905	return (error);
3906}
3907
3908/*
3909 * NFS silly-renamed file removal function called from nfs_vnop_inactive
3910 */
3911int
3912nfs_removeit(struct nfs_sillyrename *nsp)
3913{
3914	struct nfsmount *nmp = NFSTONMP(nsp->nsr_dnp);
3915	if (!nmp)
3916		return (ENXIO);
3917	return nmp->nm_funcs->nf_remove_rpc(nsp->nsr_dnp, nsp->nsr_name, nsp->nsr_namlen, NULL, nsp->nsr_cred);
3918}
3919
3920/*
3921 * NFS remove rpc, called from nfs_remove() and nfs_removeit().
3922 */
3923int
3924nfs3_remove_rpc(
3925	nfsnode_t dnp,
3926	char *name,
3927	int namelen,
3928	thread_t thd,
3929	kauth_cred_t cred)
3930{
3931	int error = 0, lockerror = ENOENT, status, wccpostattr = 0;
3932	struct timespec premtime = { 0, 0 };
3933	struct nfsmount *nmp;
3934	int nfsvers;
3935	u_int64_t xid;
3936	struct nfsm_chain nmreq, nmrep;
3937
3938	nmp = NFSTONMP(dnp);
3939	if (!nmp)
3940		return (ENXIO);
3941	nfsvers = nmp->nm_vers;
3942	if ((nfsvers == NFS_VER2) && (namelen > NFS_MAXNAMLEN))
3943		return (ENAMETOOLONG);
3944
3945	nfsm_chain_null(&nmreq);
3946	nfsm_chain_null(&nmrep);
3947
3948	nfsm_chain_build_alloc_init(error, &nmreq,
3949		NFSX_FH(nfsvers) + NFSX_UNSIGNED + nfsm_rndup(namelen));
3950	nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize);
3951	nfsm_chain_add_name(error, &nmreq, name, namelen, nmp);
3952	nfsm_chain_build_done(error, &nmreq);
3953	nfsmout_if(error);
3954
3955	error = nfs_request2(dnp, NULL, &nmreq, NFSPROC_REMOVE, thd, cred, NULL, 0, &nmrep, &xid, &status);
3956
3957	if ((lockerror = nfs_node_lock(dnp)))
3958		error = lockerror;
3959	if (nfsvers == NFS_VER3)
3960		nfsm_chain_get_wcc_data(error, &nmrep, dnp, &premtime, &wccpostattr, &xid);
3961	nfsmout_if(error);
3962	dnp->n_flag |= NMODIFIED;
3963	/* if directory hadn't changed, update namecache mtime */
3964	if (nfstimespeccmp(&dnp->n_ncmtime, &premtime, ==))
3965		NFS_CHANGED_UPDATE_NC(nfsvers, dnp, &dnp->n_vattr);
3966	if (!wccpostattr)
3967		NATTRINVALIDATE(dnp);
3968	if (!error)
3969		error = status;
3970nfsmout:
3971	if (!lockerror)
3972		nfs_node_unlock(dnp);
3973	nfsm_chain_cleanup(&nmreq);
3974	nfsm_chain_cleanup(&nmrep);
3975	return (error);
3976}
3977
3978/*
3979 * NFS file rename call
3980 */
3981int
3982nfs_vnop_rename(
3983	struct vnop_rename_args  /* {
3984		struct vnodeop_desc *a_desc;
3985		vnode_t a_fdvp;
3986		vnode_t a_fvp;
3987		struct componentname *a_fcnp;
3988		vnode_t a_tdvp;
3989		vnode_t a_tvp;
3990		struct componentname *a_tcnp;
3991		vfs_context_t a_context;
3992	} */ *ap)
3993{
3994	vfs_context_t ctx = ap->a_context;
3995	vnode_t fdvp = ap->a_fdvp;
3996	vnode_t fvp = ap->a_fvp;
3997	vnode_t tdvp = ap->a_tdvp;
3998	vnode_t tvp = ap->a_tvp;
3999	nfsnode_t fdnp, fnp, tdnp, tnp;
4000	struct componentname *tcnp = ap->a_tcnp;
4001	struct componentname *fcnp = ap->a_fcnp;
4002	int error, nfsvers, inuse=0, tvprecycle=0, locked=0;
4003	mount_t fmp, tdmp, tmp;
4004	struct nfs_vattr nvattr;
4005	struct nfsmount *nmp;
4006
4007	fdnp = VTONFS(fdvp);
4008	fnp = VTONFS(fvp);
4009	tdnp = VTONFS(tdvp);
4010	tnp = tvp ? VTONFS(tvp) : NULL;
4011
4012	nmp = NFSTONMP(fdnp);
4013	if (!nmp)
4014		return (ENXIO);
4015	nfsvers = nmp->nm_vers;
4016
4017	error = nfs_node_set_busy4(fdnp, fnp, tdnp, tnp, vfs_context_thread(ctx));
4018	if (error)
4019		return (error);
4020
4021	if (tvp && (tvp != fvp)) {
4022		/* lock the node while we rename over the existing file */
4023		lck_mtx_lock(nfs_node_hash_mutex);
4024		while (tnp->n_hflag & NHLOCKED) {
4025			tnp->n_hflag |= NHLOCKWANT;
4026			msleep(tnp, nfs_node_hash_mutex, PINOD, "nfs_rename", NULL);
4027		}
4028		tnp->n_hflag |= NHLOCKED;
4029		lck_mtx_unlock(nfs_node_hash_mutex);
4030		locked = 1;
4031	}
4032
4033	/* Check for cross-device rename */
4034	fmp = vnode_mount(fvp);
4035	tmp = tvp ? vnode_mount(tvp) : NULL;
4036	tdmp = vnode_mount(tdvp);
4037	if ((fmp != tdmp) || (tvp && (fmp != tmp))) {
4038		error = EXDEV;
4039		goto out;
4040	}
4041
4042	/* XXX prevent renaming from/over a sillyrenamed file? */
4043
4044	/*
4045	 * If the tvp exists and is in use, sillyrename it before doing the
4046	 * rename of the new file over it.
4047	 * XXX Can't sillyrename a directory.
4048	 * Don't sillyrename if source and target are same vnode (hard
4049	 * links or case-variants)
4050	 */
4051	if (tvp && (tvp != fvp))
4052		inuse = vnode_isinuse(tvp, 0);
4053	if (inuse && !tnp->n_sillyrename && (vnode_vtype(tvp) != VDIR)) {
4054		error = nfs_sillyrename(tdnp, tnp, tcnp, ctx);
4055		if (error) {
4056			/* sillyrename failed. Instead of pressing on, return error */
4057			goto out; /* should not be ENOENT. */
4058		} else {
4059			/* sillyrename succeeded.*/
4060			tvp = NULL;
4061		}
4062	} else if (tvp && (nmp->nm_vers >= NFS_VER4) && (tnp->n_openflags & N_DELEG_MASK)) {
4063		nfs4_delegation_return(tnp, 0, vfs_context_thread(ctx), vfs_context_ucred(ctx));
4064	}
4065
4066	error = nmp->nm_funcs->nf_rename_rpc(fdnp, fcnp->cn_nameptr, fcnp->cn_namelen,
4067			tdnp, tcnp->cn_nameptr, tcnp->cn_namelen, ctx);
4068
4069	/*
4070	 * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry.
4071	 */
4072	if (error == ENOENT)
4073		error = 0;
4074
4075	if (tvp && (tvp != fvp) && !tnp->n_sillyrename) {
4076		nfs_node_lock_force(tnp);
4077		tvprecycle = (!error && !vnode_isinuse(tvp, 0) &&
4078		    (nfs_getattrcache(tnp, &nvattr, 0) || (nvattr.nva_nlink == 1)));
4079		nfs_node_unlock(tnp);
4080		lck_mtx_lock(nfs_node_hash_mutex);
4081		if (tvprecycle && (tnp->n_hflag & NHHASHED)) {
4082			/*
4083			 * remove nfsnode from hash now so we can't accidentally find it
4084			 * again if another object gets created with the same filehandle
4085			 * before this vnode gets reclaimed
4086			 */
4087			LIST_REMOVE(tnp, n_hash);
4088			tnp->n_hflag &= ~NHHASHED;
4089			FSDBG(266, 0, tnp, tnp->n_flag, 0xb1eb1e);
4090		}
4091		lck_mtx_unlock(nfs_node_hash_mutex);
4092	}
4093
4094	/* purge the old name cache entries and enter the new one */
4095	nfs_name_cache_purge(fdnp, fnp, fcnp, ctx);
4096	if (tvp) {
4097		nfs_name_cache_purge(tdnp, tnp, tcnp, ctx);
4098		if (tvprecycle) {
4099			/* clear flags now: won't get nfs_vnop_inactive for recycled vnode */
4100			/* clear all flags other than these */
4101			nfs_node_lock_force(tnp);
4102			tnp->n_flag &= (NMODIFIED);
4103			nfs_node_unlock(tnp);
4104			vnode_recycle(tvp);
4105		}
4106	}
4107	if (!error) {
4108		nfs_node_lock_force(tdnp);
4109		if (tdnp->n_flag & NNEGNCENTRIES) {
4110			tdnp->n_flag &= ~NNEGNCENTRIES;
4111			cache_purge_negatives(tdvp);
4112		}
4113		nfs_node_unlock(tdnp);
4114		nfs_node_lock_force(fnp);
4115		cache_enter(tdvp, fvp, tcnp);
4116		if (tdvp != fdvp) {	/* update parent pointer */
4117			if (fnp->n_parent && !vnode_get(fnp->n_parent)) {
4118				/* remove ref from old parent */
4119				vnode_rele(fnp->n_parent);
4120				vnode_put(fnp->n_parent);
4121			}
4122			fnp->n_parent = tdvp;
4123			if (tdvp && !vnode_get(tdvp)) {
4124				/* add ref to new parent */
4125				vnode_ref(tdvp);
4126				vnode_put(tdvp);
4127			} else {
4128				fnp->n_parent = NULL;
4129			}
4130		}
4131		nfs_node_unlock(fnp);
4132	}
4133out:
4134	/* nfs_getattr() will check changed and purge caches */
4135	nfs_getattr(fdnp, NULL, ctx, NGA_CACHED);
4136	nfs_getattr(tdnp, NULL, ctx, NGA_CACHED);
4137	if (locked) {
4138		/* unlock node */
4139		lck_mtx_lock(nfs_node_hash_mutex);
4140		tnp->n_hflag &= ~NHLOCKED;
4141		if (tnp->n_hflag & NHLOCKWANT) {
4142			tnp->n_hflag &= ~NHLOCKWANT;
4143			wakeup(tnp);
4144		}
4145		lck_mtx_unlock(nfs_node_hash_mutex);
4146	}
4147	nfs_node_clear_busy4(fdnp, fnp, tdnp, tnp);
4148	return (error);
4149}
4150
4151/*
4152 * Do an NFS rename rpc. Called from nfs_vnop_rename() and nfs_sillyrename().
4153 */
4154int
4155nfs3_rename_rpc(
4156	nfsnode_t fdnp,
4157	char *fnameptr,
4158	int fnamelen,
4159	nfsnode_t tdnp,
4160	char *tnameptr,
4161	int tnamelen,
4162	vfs_context_t ctx)
4163{
4164	int error = 0, lockerror = ENOENT, status, fwccpostattr = 0, twccpostattr = 0;
4165	struct timespec fpremtime = { 0, 0 }, tpremtime = { 0, 0 };
4166	struct nfsmount *nmp;
4167	int nfsvers;
4168	u_int64_t xid, txid;
4169	struct nfsm_chain nmreq, nmrep;
4170
4171	nmp = NFSTONMP(fdnp);
4172	if (!nmp)
4173		return (ENXIO);
4174	nfsvers = nmp->nm_vers;
4175	if ((nfsvers == NFS_VER2) &&
4176	    ((fnamelen > NFS_MAXNAMLEN) || (tnamelen > NFS_MAXNAMLEN)))
4177		return (ENAMETOOLONG);
4178
4179	nfsm_chain_null(&nmreq);
4180	nfsm_chain_null(&nmrep);
4181
4182	nfsm_chain_build_alloc_init(error, &nmreq,
4183		(NFSX_FH(nfsvers) + NFSX_UNSIGNED) * 2 +
4184		nfsm_rndup(fnamelen) + nfsm_rndup(tnamelen));
4185	nfsm_chain_add_fh(error, &nmreq, nfsvers, fdnp->n_fhp, fdnp->n_fhsize);
4186	nfsm_chain_add_name(error, &nmreq, fnameptr, fnamelen, nmp);
4187	nfsm_chain_add_fh(error, &nmreq, nfsvers, tdnp->n_fhp, tdnp->n_fhsize);
4188	nfsm_chain_add_name(error, &nmreq, tnameptr, tnamelen, nmp);
4189	nfsm_chain_build_done(error, &nmreq);
4190	nfsmout_if(error);
4191
4192	error = nfs_request(fdnp, NULL, &nmreq, NFSPROC_RENAME, ctx, NULL, &nmrep, &xid, &status);
4193
4194	if ((lockerror = nfs_node_lock2(fdnp, tdnp)))
4195		error = lockerror;
4196	if (nfsvers == NFS_VER3) {
4197		txid = xid;
4198		nfsm_chain_get_wcc_data(error, &nmrep, fdnp, &fpremtime, &fwccpostattr, &xid);
4199		nfsm_chain_get_wcc_data(error, &nmrep, tdnp, &tpremtime, &twccpostattr, &txid);
4200	}
4201	if (!error)
4202		error = status;
4203nfsmout:
4204	nfsm_chain_cleanup(&nmreq);
4205	nfsm_chain_cleanup(&nmrep);
4206	if (!lockerror) {
4207		fdnp->n_flag |= NMODIFIED;
4208		/* if directory hadn't changed, update namecache mtime */
4209		if (nfstimespeccmp(&fdnp->n_ncmtime, &fpremtime, ==))
4210			NFS_CHANGED_UPDATE_NC(nfsvers, fdnp, &fdnp->n_vattr);
4211		if (!fwccpostattr)
4212			NATTRINVALIDATE(fdnp);
4213		tdnp->n_flag |= NMODIFIED;
4214		/* if directory hadn't changed, update namecache mtime */
4215		if (nfstimespeccmp(&tdnp->n_ncmtime, &tpremtime, ==))
4216			NFS_CHANGED_UPDATE_NC(nfsvers, tdnp, &tdnp->n_vattr);
4217		if (!twccpostattr)
4218			NATTRINVALIDATE(tdnp);
4219		nfs_node_unlock2(fdnp, tdnp);
4220	}
4221	return (error);
4222}
4223
4224/*
4225 * NFS hard link create call
4226 */
4227int
4228nfs3_vnop_link(
4229	struct vnop_link_args /* {
4230		struct vnodeop_desc *a_desc;
4231		vnode_t a_vp;
4232		vnode_t a_tdvp;
4233		struct componentname *a_cnp;
4234		vfs_context_t a_context;
4235	} */ *ap)
4236{
4237	vfs_context_t ctx = ap->a_context;
4238	vnode_t vp = ap->a_vp;
4239	vnode_t tdvp = ap->a_tdvp;
4240	struct componentname *cnp = ap->a_cnp;
4241	int error = 0, lockerror = ENOENT, status, wccpostattr = 0, attrflag = 0;
4242	struct timespec premtime = { 0, 0 };
4243	struct nfsmount *nmp;
4244	nfsnode_t np = VTONFS(vp);
4245	nfsnode_t tdnp = VTONFS(tdvp);
4246	int nfsvers;
4247	u_int64_t xid, txid;
4248	struct nfsm_chain nmreq, nmrep;
4249
4250	if (vnode_mount(vp) != vnode_mount(tdvp))
4251		return (EXDEV);
4252
4253	nmp = VTONMP(vp);
4254	if (!nmp)
4255		return (ENXIO);
4256	nfsvers = nmp->nm_vers;
4257	if ((nfsvers == NFS_VER2) && (cnp->cn_namelen > NFS_MAXNAMLEN))
4258		return (ENAMETOOLONG);
4259
4260	/*
4261	 * Push all writes to the server, so that the attribute cache
4262	 * doesn't get "out of sync" with the server.
4263	 * XXX There should be a better way!
4264	 */
4265	nfs_flush(np, MNT_WAIT, vfs_context_thread(ctx), V_IGNORE_WRITEERR);
4266
4267	error = nfs_node_set_busy2(tdnp, np, vfs_context_thread(ctx));
4268	if (error)
4269		return (error);
4270
4271	nfsm_chain_null(&nmreq);
4272	nfsm_chain_null(&nmrep);
4273
4274	nfsm_chain_build_alloc_init(error, &nmreq,
4275		NFSX_FH(nfsvers)*2 + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen));
4276	nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize);
4277	nfsm_chain_add_fh(error, &nmreq, nfsvers, tdnp->n_fhp, tdnp->n_fhsize);
4278	nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp);
4279	nfsm_chain_build_done(error, &nmreq);
4280	nfsmout_if(error);
4281	error = nfs_request(np, NULL, &nmreq, NFSPROC_LINK, ctx, NULL, &nmrep, &xid, &status);
4282
4283	if ((lockerror = nfs_node_lock2(tdnp, np))) {
4284		error = lockerror;
4285		goto nfsmout;
4286	}
4287	if (nfsvers == NFS_VER3) {
4288		txid = xid;
4289		nfsm_chain_postop_attr_update_flag(error, &nmrep, np, attrflag, &xid);
4290		nfsm_chain_get_wcc_data(error, &nmrep, tdnp, &premtime, &wccpostattr, &txid);
4291	}
4292	if (!error)
4293		error = status;
4294nfsmout:
4295	nfsm_chain_cleanup(&nmreq);
4296	nfsm_chain_cleanup(&nmrep);
4297	if (!lockerror) {
4298		if (!attrflag)
4299			NATTRINVALIDATE(np);
4300		tdnp->n_flag |= NMODIFIED;
4301		/* if directory hadn't changed, update namecache mtime */
4302		if (nfstimespeccmp(&tdnp->n_ncmtime, &premtime, ==))
4303			NFS_CHANGED_UPDATE_NC(nfsvers, tdnp, &tdnp->n_vattr);
4304		if (!wccpostattr)
4305			NATTRINVALIDATE(tdnp);
4306		if (!error && (tdnp->n_flag & NNEGNCENTRIES)) {
4307			tdnp->n_flag &= ~NNEGNCENTRIES;
4308			cache_purge_negatives(tdvp);
4309		}
4310		nfs_node_unlock2(tdnp, np);
4311	}
4312	nfs_node_clear_busy2(tdnp, np);
4313	/*
4314	 * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry.
4315	 */
4316	if (error == EEXIST)
4317		error = 0;
4318	return (error);
4319}
4320
4321/*
4322 * NFS symbolic link create call
4323 */
4324int
4325nfs3_vnop_symlink(
4326	struct vnop_symlink_args /* {
4327		struct vnodeop_desc *a_desc;
4328		vnode_t a_dvp;
4329		vnode_t *a_vpp;
4330		struct componentname *a_cnp;
4331		struct vnode_attr *a_vap;
4332		char *a_target;
4333		vfs_context_t a_context;
4334	} */ *ap)
4335{
4336	vfs_context_t ctx = ap->a_context;
4337	vnode_t dvp = ap->a_dvp;
4338	struct vnode_attr *vap = ap->a_vap;
4339	struct componentname *cnp = ap->a_cnp;
4340	struct nfs_vattr nvattr;
4341	fhandle_t fh;
4342	int slen, error = 0, lockerror = ENOENT, busyerror = ENOENT, status, wccpostattr = 0;
4343	struct timespec premtime = { 0, 0 };
4344	vnode_t newvp = NULL;
4345	int nfsvers, gotuid, gotgid;
4346	u_int64_t xid = 0, dxid;
4347	nfsnode_t np = NULL;
4348	nfsnode_t dnp = VTONFS(dvp);
4349	struct nfsmount *nmp;
4350	struct nfsm_chain nmreq, nmrep;
4351	struct nfsreq rq, *req = &rq;
4352	struct nfs_dulookup dul;
4353
4354	nmp = VTONMP(dvp);
4355	if (!nmp)
4356		return (ENXIO);
4357	nfsvers = nmp->nm_vers;
4358
4359	slen = strlen(ap->a_target);
4360	if ((nfsvers == NFS_VER2) &&
4361	    ((cnp->cn_namelen > NFS_MAXNAMLEN) || (slen > NFS_MAXPATHLEN)))
4362		return (ENAMETOOLONG);
4363
4364	nfs_avoid_needless_id_setting_on_create(dnp, vap, ctx);
4365
4366	VATTR_SET_SUPPORTED(vap, va_mode);
4367	VATTR_SET_SUPPORTED(vap, va_uid);
4368	VATTR_SET_SUPPORTED(vap, va_gid);
4369	VATTR_SET_SUPPORTED(vap, va_data_size);
4370	VATTR_SET_SUPPORTED(vap, va_access_time);
4371	VATTR_SET_SUPPORTED(vap, va_modify_time);
4372	gotuid = VATTR_IS_ACTIVE(vap, va_uid);
4373	gotgid = VATTR_IS_ACTIVE(vap, va_gid);
4374
4375	error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx));
4376	nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx);
4377
4378	nfsm_chain_null(&nmreq);
4379	nfsm_chain_null(&nmrep);
4380
4381	nfsm_chain_build_alloc_init(error, &nmreq,
4382		NFSX_FH(nfsvers) + 2 * NFSX_UNSIGNED +
4383		nfsm_rndup(cnp->cn_namelen) + nfsm_rndup(slen) + NFSX_SATTR(nfsvers));
4384	nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize);
4385	nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp);
4386	if (nfsvers == NFS_VER3)
4387		nfsm_chain_add_v3sattr(error, &nmreq, vap);
4388	nfsm_chain_add_name(error, &nmreq, ap->a_target, slen, nmp);
4389	if (nfsvers == NFS_VER2)
4390		nfsm_chain_add_v2sattr(error, &nmreq, vap, -1);
4391	nfsm_chain_build_done(error, &nmreq);
4392	nfsmout_if(error);
4393
4394	error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_SYMLINK,
4395			vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req);
4396	if (!error) {
4397		nfs_dulookup_start(&dul, dnp, ctx);
4398		error = nfs_request_async_finish(req, &nmrep, &xid, &status);
4399	}
4400
4401	if ((lockerror = nfs_node_lock(dnp)))
4402		error = lockerror;
4403	dxid = xid;
4404	if (!error && !status) {
4405		if (dnp->n_flag & NNEGNCENTRIES) {
4406			dnp->n_flag &= ~NNEGNCENTRIES;
4407			cache_purge_negatives(dvp);
4408		}
4409		if (nfsvers == NFS_VER3)
4410			error = nfsm_chain_get_fh_attr(&nmrep, dnp, ctx, nfsvers, &xid, &fh, &nvattr);
4411		else
4412			fh.fh_len = 0;
4413	}
4414	if (nfsvers == NFS_VER3)
4415		nfsm_chain_get_wcc_data(error, &nmrep, dnp, &premtime, &wccpostattr, &dxid);
4416	if (!error)
4417		error = status;
4418nfsmout:
4419	nfsm_chain_cleanup(&nmreq);
4420	nfsm_chain_cleanup(&nmrep);
4421
4422	if (!lockerror) {
4423		dnp->n_flag |= NMODIFIED;
4424		/* if directory hadn't changed, update namecache mtime */
4425		if (nfstimespeccmp(&dnp->n_ncmtime, &premtime, ==))
4426			NFS_CHANGED_UPDATE_NC(nfsvers, dnp, &dnp->n_vattr);
4427		nfs_node_unlock(dnp);
4428		/* nfs_getattr() will check changed and purge caches */
4429		nfs_getattr(dnp, NULL, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED);
4430	}
4431
4432	if (!error && fh.fh_len)
4433		error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &np);
4434	if (!error && np)
4435		newvp = NFSTOV(np);
4436
4437	nfs_dulookup_finish(&dul, dnp, ctx);
4438
4439	/*
4440	 * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry
4441	 * if we can succeed in looking up the symlink.
4442	 */
4443	if ((error == EEXIST) || (!error && !newvp)) {
4444		if (newvp) {
4445			nfs_node_unlock(np);
4446			vnode_put(newvp);
4447			newvp = NULL;
4448		}
4449		error = nfs_lookitup(dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx, &np);
4450		if (!error) {
4451			newvp = NFSTOV(np);
4452			if (vnode_vtype(newvp) != VLNK)
4453				error = EEXIST;
4454		}
4455	}
4456	if (!busyerror)
4457		nfs_node_clear_busy(dnp);
4458	if (!error && (gotuid || gotgid) &&
4459	    (!newvp || nfs_getattrcache(np, &nvattr, 0) ||
4460	     (gotuid && (nvattr.nva_uid != vap->va_uid)) ||
4461	     (gotgid && (nvattr.nva_gid != vap->va_gid)))) {
4462		/* clear ID bits if server didn't use them (or we can't tell) */
4463		VATTR_CLEAR_SUPPORTED(vap, va_uid);
4464		VATTR_CLEAR_SUPPORTED(vap, va_gid);
4465	}
4466	if (error) {
4467		if (newvp) {
4468			nfs_node_unlock(np);
4469			vnode_put(newvp);
4470		}
4471	} else {
4472		nfs_node_unlock(np);
4473		*ap->a_vpp = newvp;
4474	}
4475	return (error);
4476}
4477
4478/*
4479 * NFS make dir call
4480 */
4481int
4482nfs3_vnop_mkdir(
4483	struct vnop_mkdir_args /* {
4484		struct vnodeop_desc *a_desc;
4485		vnode_t a_dvp;
4486		vnode_t *a_vpp;
4487		struct componentname *a_cnp;
4488		struct vnode_attr *a_vap;
4489		vfs_context_t a_context;
4490	} */ *ap)
4491{
4492	vfs_context_t ctx = ap->a_context;
4493	vnode_t dvp = ap->a_dvp;
4494	struct vnode_attr *vap = ap->a_vap;
4495	struct componentname *cnp = ap->a_cnp;
4496	struct nfs_vattr nvattr;
4497	nfsnode_t np = NULL;
4498	struct nfsmount *nmp;
4499	nfsnode_t dnp = VTONFS(dvp);
4500	vnode_t newvp = NULL;
4501	int error = 0, lockerror = ENOENT, busyerror = ENOENT, status, wccpostattr = 0;
4502	struct timespec premtime = { 0, 0 };
4503	int nfsvers, gotuid, gotgid;
4504	u_int64_t xid= 0, dxid;
4505	fhandle_t fh;
4506	struct nfsm_chain nmreq, nmrep;
4507	struct nfsreq rq, *req = &rq;
4508	struct nfs_dulookup dul;
4509
4510	nmp = VTONMP(dvp);
4511	if (!nmp)
4512		return (ENXIO);
4513	nfsvers = nmp->nm_vers;
4514	if ((nfsvers == NFS_VER2) && (cnp->cn_namelen > NFS_MAXNAMLEN))
4515		return (ENAMETOOLONG);
4516
4517	nfs_avoid_needless_id_setting_on_create(dnp, vap, ctx);
4518
4519	VATTR_SET_SUPPORTED(vap, va_mode);
4520	VATTR_SET_SUPPORTED(vap, va_uid);
4521	VATTR_SET_SUPPORTED(vap, va_gid);
4522	VATTR_SET_SUPPORTED(vap, va_data_size);
4523	VATTR_SET_SUPPORTED(vap, va_access_time);
4524	VATTR_SET_SUPPORTED(vap, va_modify_time);
4525	gotuid = VATTR_IS_ACTIVE(vap, va_uid);
4526	gotgid = VATTR_IS_ACTIVE(vap, va_gid);
4527
4528	error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx));
4529	nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx);
4530
4531	nfsm_chain_null(&nmreq);
4532	nfsm_chain_null(&nmrep);
4533
4534	nfsm_chain_build_alloc_init(error, &nmreq,
4535		NFSX_FH(nfsvers) + NFSX_UNSIGNED +
4536		nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(nfsvers));
4537	nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize);
4538	nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp);
4539	if (nfsvers == NFS_VER3)
4540		nfsm_chain_add_v3sattr(error, &nmreq, vap);
4541	else
4542		nfsm_chain_add_v2sattr(error, &nmreq, vap, -1);
4543	nfsm_chain_build_done(error, &nmreq);
4544	nfsmout_if(error);
4545
4546	error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_MKDIR,
4547			vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req);
4548	if (!error) {
4549		nfs_dulookup_start(&dul, dnp, ctx);
4550		error = nfs_request_async_finish(req, &nmrep, &xid, &status);
4551	}
4552
4553	if ((lockerror = nfs_node_lock(dnp)))
4554		error = lockerror;
4555	dxid = xid;
4556	if (!error && !status) {
4557		if (dnp->n_flag & NNEGNCENTRIES) {
4558			dnp->n_flag &= ~NNEGNCENTRIES;
4559			cache_purge_negatives(dvp);
4560		}
4561		error = nfsm_chain_get_fh_attr(&nmrep, dnp, ctx, nfsvers, &xid, &fh, &nvattr);
4562	}
4563	if (nfsvers == NFS_VER3)
4564		nfsm_chain_get_wcc_data(error, &nmrep, dnp, &premtime, &wccpostattr, &dxid);
4565	if (!error)
4566		error = status;
4567nfsmout:
4568	nfsm_chain_cleanup(&nmreq);
4569	nfsm_chain_cleanup(&nmrep);
4570
4571	if (!lockerror) {
4572		dnp->n_flag |= NMODIFIED;
4573		/* if directory hadn't changed, update namecache mtime */
4574		if (nfstimespeccmp(&dnp->n_ncmtime, &premtime, ==))
4575			NFS_CHANGED_UPDATE_NC(nfsvers, dnp, &dnp->n_vattr);
4576		nfs_node_unlock(dnp);
4577		/* nfs_getattr() will check changed and purge caches */
4578		nfs_getattr(dnp, NULL, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED);
4579	}
4580
4581	if (!error && fh.fh_len)
4582		error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &np);
4583	if (!error && np)
4584		newvp = NFSTOV(np);
4585
4586	nfs_dulookup_finish(&dul, dnp, ctx);
4587
4588	/*
4589	 * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry
4590	 * if we can succeed in looking up the directory.
4591	 */
4592	if ((error == EEXIST) || (!error && !newvp)) {
4593		if (newvp) {
4594			nfs_node_unlock(np);
4595			vnode_put(newvp);
4596			newvp = NULL;
4597		}
4598		error = nfs_lookitup(dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx, &np);
4599		if (!error) {
4600			newvp = NFSTOV(np);
4601			if (vnode_vtype(newvp) != VDIR)
4602				error = EEXIST;
4603		}
4604	}
4605	if (!busyerror)
4606		nfs_node_clear_busy(dnp);
4607	if (!error && (gotuid || gotgid) &&
4608	    (!newvp || nfs_getattrcache(np, &nvattr, 0) ||
4609	     (gotuid && (nvattr.nva_uid != vap->va_uid)) ||
4610	     (gotgid && (nvattr.nva_gid != vap->va_gid)))) {
4611		/* clear ID bits if server didn't use them (or we can't tell) */
4612		VATTR_CLEAR_SUPPORTED(vap, va_uid);
4613		VATTR_CLEAR_SUPPORTED(vap, va_gid);
4614	}
4615	if (error) {
4616		if (newvp) {
4617			nfs_node_unlock(np);
4618			vnode_put(newvp);
4619		}
4620	} else {
4621		nfs_node_unlock(np);
4622		*ap->a_vpp = newvp;
4623	}
4624	return (error);
4625}
4626
4627/*
4628 * NFS remove directory call
4629 */
4630int
4631nfs3_vnop_rmdir(
4632	struct vnop_rmdir_args /* {
4633		struct vnodeop_desc *a_desc;
4634		vnode_t a_dvp;
4635		vnode_t a_vp;
4636		struct componentname *a_cnp;
4637		vfs_context_t a_context;
4638	} */ *ap)
4639{
4640	vfs_context_t ctx = ap->a_context;
4641	vnode_t vp = ap->a_vp;
4642	vnode_t dvp = ap->a_dvp;
4643	struct componentname *cnp = ap->a_cnp;
4644	int error = 0, lockerror = ENOENT, status, wccpostattr = 0;
4645	struct timespec premtime = { 0, 0 };
4646	struct nfsmount *nmp;
4647	nfsnode_t np = VTONFS(vp);
4648	nfsnode_t dnp = VTONFS(dvp);
4649	int nfsvers;
4650	u_int64_t xid;
4651	struct nfsm_chain nmreq, nmrep;
4652	struct nfsreq rq, *req = &rq;
4653	struct nfs_dulookup dul;
4654
4655	nmp = VTONMP(vp);
4656	if (!nmp)
4657		return (ENXIO);
4658	nfsvers = nmp->nm_vers;
4659	if ((nfsvers == NFS_VER2) && (cnp->cn_namelen > NFS_MAXNAMLEN))
4660		return (ENAMETOOLONG);
4661
4662	if ((error = nfs_node_set_busy2(dnp, np, vfs_context_thread(ctx))))
4663		return (error);
4664
4665	nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx);
4666
4667	nfsm_chain_null(&nmreq);
4668	nfsm_chain_null(&nmrep);
4669
4670	nfsm_chain_build_alloc_init(error, &nmreq,
4671		NFSX_FH(nfsvers) + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen));
4672	nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize);
4673	nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp);
4674	nfsm_chain_build_done(error, &nmreq);
4675	nfsmout_if(error);
4676
4677	error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_RMDIR,
4678			vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req);
4679	if (!error) {
4680		nfs_dulookup_start(&dul, dnp, ctx);
4681		error = nfs_request_async_finish(req, &nmrep, &xid, &status);
4682	}
4683
4684	if ((lockerror = nfs_node_lock(dnp)))
4685		error = lockerror;
4686	if (nfsvers == NFS_VER3)
4687		nfsm_chain_get_wcc_data(error, &nmrep, dnp, &premtime, &wccpostattr, &xid);
4688	if (!error)
4689		error = status;
4690nfsmout:
4691	nfsm_chain_cleanup(&nmreq);
4692	nfsm_chain_cleanup(&nmrep);
4693
4694	if (!lockerror) {
4695		dnp->n_flag |= NMODIFIED;
4696		/* if directory hadn't changed, update namecache mtime */
4697		if (nfstimespeccmp(&dnp->n_ncmtime, &premtime, ==))
4698			NFS_CHANGED_UPDATE_NC(nfsvers, dnp, &dnp->n_vattr);
4699		nfs_node_unlock(dnp);
4700		nfs_name_cache_purge(dnp, np, cnp, ctx);
4701		/* nfs_getattr() will check changed and purge caches */
4702		nfs_getattr(dnp, NULL, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED);
4703	}
4704	nfs_dulookup_finish(&dul, dnp, ctx);
4705	nfs_node_clear_busy2(dnp, np);
4706
4707	/*
4708	 * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry.
4709	 */
4710	if (error == ENOENT)
4711		error = 0;
4712	if (!error) {
4713		/*
4714		 * remove nfsnode from hash now so we can't accidentally find it
4715		 * again if another object gets created with the same filehandle
4716		 * before this vnode gets reclaimed
4717		 */
4718		lck_mtx_lock(nfs_node_hash_mutex);
4719		if (np->n_hflag & NHHASHED) {
4720			LIST_REMOVE(np, n_hash);
4721			np->n_hflag &= ~NHHASHED;
4722			FSDBG(266, 0, np, np->n_flag, 0xb1eb1e);
4723		}
4724		lck_mtx_unlock(nfs_node_hash_mutex);
4725	}
4726	return (error);
4727}
4728
4729/*
4730 * NFS readdir call
4731 *
4732 * The incoming "offset" is a directory cookie indicating where in the
4733 * directory entries should be read from.  A zero cookie means start at
4734 * the beginning of the directory.  Any other cookie will be a cookie
4735 * returned from the server.
4736 *
4737 * Using that cookie, determine which buffer (and where in that buffer)
4738 * to start returning entries from.  Buffer logical block numbers are
4739 * the cookies they start at.  If a buffer is found that is not full,
4740 * call into the bio/RPC code to fill it.  The RPC code will probably
4741 * fill several buffers (dropping the first, requiring a re-get).
4742 *
4743 * When done copying entries to the buffer, set the offset to the current
4744 * entry's cookie and enter that cookie in the cookie cache.
4745 *
4746 * Note: because the getdirentries(2) API returns a long-typed offset,
4747 * the incoming offset is a potentially truncated cookie (ptc).
4748 * The cookie matching code is aware of this and will fall back to
4749 * matching only 32 bits of the cookie.
4750 */
4751int
4752nfs_vnop_readdir(
4753	struct vnop_readdir_args /* {
4754		struct vnodeop_desc *a_desc;
4755		vnode_t a_vp;
4756		struct uio *a_uio;
4757		int a_flags;
4758		int *a_eofflag;
4759		int *a_numdirent;
4760		vfs_context_t a_context;
4761	} */ *ap)
4762{
4763	vfs_context_t ctx = ap->a_context;
4764	vnode_t dvp = ap->a_vp;
4765	nfsnode_t dnp = VTONFS(dvp);
4766	struct nfsmount *nmp;
4767	uio_t uio = ap->a_uio;
4768	int error, nfsvers, extended, numdirent, bigcookies, ptc, done;
4769	uint16_t i, iptc, rlen, nlen;
4770	uint64_t cookie, nextcookie, lbn = 0;
4771	struct nfsbuf *bp = NULL;
4772	struct nfs_dir_buf_header *ndbhp;
4773	struct direntry *dp, *dpptc;
4774	struct dirent dent;
4775	char *cp = NULL;
4776	thread_t thd;
4777
4778	nmp = VTONMP(dvp);
4779	if (!nmp)
4780		return (ENXIO);
4781	nfsvers = nmp->nm_vers;
4782	bigcookies = (nmp->nm_state & NFSSTA_BIGCOOKIES);
4783	extended = (ap->a_flags & VNODE_READDIR_EXTENDED);
4784
4785	if (vnode_vtype(dvp) != VDIR)
4786		return (EPERM);
4787
4788	if (ap->a_eofflag)
4789		*ap->a_eofflag = 0;
4790
4791	if (uio_resid(uio) == 0)
4792		return (0);
4793
4794	if ((nfsvers >= NFS_VER4) && (dnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER)) {
4795		/* trigger directories should never be read, return nothing */
4796		return (0);
4797	}
4798
4799	thd = vfs_context_thread(ctx);
4800	numdirent = done = 0;
4801	nextcookie = uio_offset(uio);
4802	ptc = bigcookies && NFS_DIR_COOKIE_POTENTIALLY_TRUNCATED(nextcookie);
4803
4804	if ((error = nfs_node_lock(dnp)))
4805		goto out;
4806
4807	if (dnp->n_flag & NNEEDINVALIDATE) {
4808		dnp->n_flag &= ~NNEEDINVALIDATE;
4809		nfs_invaldir(dnp);
4810		nfs_node_unlock(dnp);
4811		error = nfs_vinvalbuf(dvp, 0, ctx, 1);
4812		if (!error)
4813			error = nfs_node_lock(dnp);
4814		if (error)
4815			goto out;
4816	}
4817
4818	/*
4819	 * check for need to invalidate when (re)starting at beginning
4820	 */
4821	if (!nextcookie) {
4822		if (dnp->n_flag & NMODIFIED) {
4823			nfs_invaldir(dnp);
4824			nfs_node_unlock(dnp);
4825			if ((error = nfs_vinvalbuf(dvp, 0, ctx, 1)))
4826				goto out;
4827		} else {
4828			nfs_node_unlock(dnp);
4829		}
4830		/* nfs_getattr() will check changed and purge caches */
4831		if ((error = nfs_getattr(dnp, NULL, ctx, NGA_UNCACHED)))
4832			goto out;
4833	} else {
4834		nfs_node_unlock(dnp);
4835	}
4836
4837	error = nfs_dir_cookie_to_lbn(dnp, nextcookie, &ptc, &lbn);
4838	if (error) {
4839		if (error < 0) { /* just hit EOF cookie */
4840			done = 1;
4841			error = 0;
4842		}
4843		if (ap->a_eofflag)
4844			*ap->a_eofflag = 1;
4845	}
4846
4847	while (!error && !done) {
4848		OSAddAtomic64(1, &nfsstats.biocache_readdirs);
4849		cookie = nextcookie;
4850getbuffer:
4851		error = nfs_buf_get(dnp, lbn, NFS_DIRBLKSIZ, thd, NBLK_READ, &bp);
4852		if (error)
4853			goto out;
4854		ndbhp = (struct nfs_dir_buf_header*)bp->nb_data;
4855		if (!ISSET(bp->nb_flags, NB_CACHE) || !ISSET(ndbhp->ndbh_flags, NDB_FULL)) {
4856			if (!ISSET(bp->nb_flags, NB_CACHE)) { /* initialize the buffer */
4857				ndbhp->ndbh_flags = 0;
4858				ndbhp->ndbh_count = 0;
4859				ndbhp->ndbh_entry_end = sizeof(*ndbhp);
4860				ndbhp->ndbh_ncgen = dnp->n_ncgen;
4861			}
4862			error = nfs_buf_readdir(bp, ctx);
4863			if (error == NFSERR_DIRBUFDROPPED)
4864				goto getbuffer;
4865			if (error)
4866				nfs_buf_release(bp, 1);
4867			if (error && (error != ENXIO) && (error != ETIMEDOUT) && (error != EINTR) && (error != ERESTART)) {
4868				if (!nfs_node_lock(dnp)) {
4869					nfs_invaldir(dnp);
4870					nfs_node_unlock(dnp);
4871				}
4872				nfs_vinvalbuf(dvp, 0, ctx, 1);
4873				if (error == NFSERR_BAD_COOKIE)
4874					error = ENOENT;
4875			}
4876			if (error)
4877				goto out;
4878		}
4879
4880		/* find next entry to return */
4881		dp = NFS_DIR_BUF_FIRST_DIRENTRY(bp);
4882		i = 0;
4883		if ((lbn != cookie) && !(ptc && NFS_DIR_COOKIE_SAME32(lbn, cookie))) {
4884			dpptc = NULL;
4885			iptc = 0;
4886			for (; (i < ndbhp->ndbh_count) && (cookie != dp->d_seekoff); i++) {
4887				if (ptc && !dpptc && NFS_DIR_COOKIE_SAME32(cookie, dp->d_seekoff)) {
4888					iptc = i;
4889					dpptc = dp;
4890				}
4891				nextcookie = dp->d_seekoff;
4892				dp = NFS_DIRENTRY_NEXT(dp);
4893			}
4894			if ((i == ndbhp->ndbh_count) && dpptc) {
4895				i = iptc;
4896				dp = dpptc;
4897			}
4898			if (i < ndbhp->ndbh_count) {
4899				nextcookie = dp->d_seekoff;
4900				dp = NFS_DIRENTRY_NEXT(dp);
4901				i++;
4902			}
4903		}
4904		ptc = 0;  /* only have to deal with ptc on first cookie */
4905
4906		/* return as many entries as we can */
4907		for (; i < ndbhp->ndbh_count; i++) {
4908			if (extended) {
4909				rlen = dp->d_reclen;
4910				cp = (char*)dp;
4911			} else {
4912				if (!cp) {
4913					cp = (char*)&dent;
4914					bzero(cp, sizeof(dent));
4915				}
4916				if (dp->d_namlen > (sizeof(dent.d_name) - 1))
4917					nlen = sizeof(dent.d_name) - 1;
4918				else
4919					nlen = dp->d_namlen;
4920				rlen = NFS_DIRENT_LEN(nlen);
4921				dent.d_reclen = rlen;
4922				dent.d_ino = dp->d_ino;
4923				dent.d_type = dp->d_type;
4924				dent.d_namlen = nlen;
4925				strlcpy(dent.d_name, dp->d_name, nlen + 1);
4926			}
4927			/* check that the record fits */
4928			if (rlen > uio_resid(uio)) {
4929				done = 1;
4930				break;
4931			}
4932			if ((error = uiomove(cp, rlen, uio)))
4933				break;
4934			numdirent++;
4935			nextcookie = dp->d_seekoff;
4936			dp = NFS_DIRENTRY_NEXT(dp);
4937		}
4938
4939		if (i == ndbhp->ndbh_count) {
4940			/* hit end of buffer, move to next buffer */
4941			lbn = nextcookie;
4942			/* if we also hit EOF, we're done */
4943			if (ISSET(ndbhp->ndbh_flags, NDB_EOF)) {
4944				done = 1;
4945				if (ap->a_eofflag)
4946					*ap->a_eofflag = 1;
4947			}
4948		}
4949		if (!error)
4950			uio_setoffset(uio, nextcookie);
4951		if (!error && !done && (nextcookie == cookie)) {
4952			printf("nfs readdir cookie didn't change 0x%llx, %d/%d\n", cookie, i, ndbhp->ndbh_count);
4953			error = EIO;
4954		}
4955		nfs_buf_release(bp, 1);
4956	}
4957
4958	if (!error)
4959		nfs_dir_cookie_cache(dnp, nextcookie, lbn);
4960
4961	if (ap->a_numdirent)
4962		*ap->a_numdirent = numdirent;
4963out:
4964	return (error);
4965}
4966
4967
4968/*
4969 * Invalidate cached directory information, except for the actual directory
4970 * blocks (which are invalidated separately).
4971 */
4972void
4973nfs_invaldir(nfsnode_t dnp)
4974{
4975	if (vnode_vtype(NFSTOV(dnp)) != VDIR)
4976		return;
4977	dnp->n_eofcookie = 0;
4978	dnp->n_cookieverf = 0;
4979	if (!dnp->n_cookiecache)
4980		return;
4981	dnp->n_cookiecache->free = 0;
4982	dnp->n_cookiecache->mru = -1;
4983	memset(dnp->n_cookiecache->next, -1, NFSNUMCOOKIES);
4984}
4985
4986/*
4987 * calculate how much space is available for additional directory entries.
4988 */
4989uint32_t
4990nfs_dir_buf_freespace(struct nfsbuf *bp, int rdirplus)
4991{
4992	struct nfs_dir_buf_header *ndbhp = (struct nfs_dir_buf_header*)bp->nb_data;
4993	uint32_t space;
4994
4995	if (!ndbhp)
4996		return (0);
4997	space = bp->nb_bufsize - ndbhp->ndbh_entry_end;
4998	if (rdirplus)
4999		space -= ndbhp->ndbh_count * sizeof(struct nfs_vattr);
5000	return (space);
5001}
5002
5003/*
5004 * add/update a cookie->lbn entry in the directory cookie cache
5005 */
5006void
5007nfs_dir_cookie_cache(nfsnode_t dnp, uint64_t cookie, uint64_t lbn)
5008{
5009	struct nfsdmap *ndcc;
5010	int8_t i, prev;
5011
5012	if (!cookie)
5013		return;
5014
5015	if (nfs_node_lock(dnp))
5016		return;
5017
5018	if (cookie == dnp->n_eofcookie) { /* EOF cookie */
5019		nfs_node_unlock(dnp);
5020		return;
5021	}
5022
5023	ndcc = dnp->n_cookiecache;
5024	if (!ndcc) {
5025		/* allocate the cookie cache structure */
5026		MALLOC_ZONE(dnp->n_cookiecache, struct nfsdmap *,
5027			sizeof(struct nfsdmap), M_NFSDIROFF, M_WAITOK);
5028		if (!dnp->n_cookiecache) {
5029			nfs_node_unlock(dnp);
5030			return;
5031		}
5032		ndcc = dnp->n_cookiecache;
5033		ndcc->free = 0;
5034		ndcc->mru = -1;
5035		memset(ndcc->next, -1, NFSNUMCOOKIES);
5036	}
5037
5038	/*
5039	 * Search the list for this cookie.
5040	 * Keep track of previous and last entries.
5041	 */
5042	prev = -1;
5043	i = ndcc->mru;
5044	while ((i != -1) && (cookie != ndcc->cookies[i].key)) {
5045		if (ndcc->next[i] == -1) /* stop on last entry so we can reuse */
5046			break;
5047		prev = i;
5048		i = ndcc->next[i];
5049	}
5050	if ((i != -1) && (cookie == ndcc->cookies[i].key)) {
5051		/* found it, remove from list */
5052		if (prev != -1)
5053			ndcc->next[prev] = ndcc->next[i];
5054		else
5055			ndcc->mru = ndcc->next[i];
5056	} else {
5057		/* not found, use next free entry or reuse last entry */
5058		if (ndcc->free != NFSNUMCOOKIES)
5059			i = ndcc->free++;
5060		else
5061			ndcc->next[prev] = -1;
5062		ndcc->cookies[i].key = cookie;
5063		ndcc->cookies[i].lbn = lbn;
5064	}
5065	/* insert cookie at head of MRU list */
5066	ndcc->next[i] = ndcc->mru;
5067	ndcc->mru = i;
5068	nfs_node_unlock(dnp);
5069}
5070
5071/*
5072 * Try to map the given directory cookie to a directory buffer (return lbn).
5073 * If we have a possibly truncated cookie (ptc), check for 32-bit matches too.
5074 */
5075int
5076nfs_dir_cookie_to_lbn(nfsnode_t dnp, uint64_t cookie, int *ptc, uint64_t *lbnp)
5077{
5078	struct nfsdmap *ndcc = dnp->n_cookiecache;
5079	int8_t eofptc, found;
5080	int i, iptc;
5081	struct nfsmount *nmp;
5082	struct nfsbuf *bp, *lastbp;
5083	struct nfsbuflists blist;
5084	struct direntry *dp, *dpptc;
5085	struct nfs_dir_buf_header *ndbhp;
5086
5087	if (!cookie) {  /* initial cookie */
5088		*lbnp = 0;
5089		*ptc = 0;
5090		return (0);
5091	}
5092
5093	if (nfs_node_lock(dnp))
5094		return (ENOENT);
5095
5096	if (cookie == dnp->n_eofcookie) { /* EOF cookie */
5097		nfs_node_unlock(dnp);
5098		OSAddAtomic64(1, &nfsstats.direofcache_hits);
5099		*ptc = 0;
5100		return (-1);
5101	}
5102	/* note if cookie is a 32-bit match with the EOF cookie */
5103	eofptc = *ptc ? NFS_DIR_COOKIE_SAME32(cookie, dnp->n_eofcookie) : 0;
5104	iptc = -1;
5105
5106	/* search the list for the cookie */
5107	for (i = ndcc ? ndcc->mru : -1; i >= 0; i = ndcc->next[i]) {
5108		if (ndcc->cookies[i].key == cookie) {
5109			/* found a match for this cookie */
5110			*lbnp = ndcc->cookies[i].lbn;
5111			nfs_node_unlock(dnp);
5112			OSAddAtomic64(1, &nfsstats.direofcache_hits);
5113			*ptc = 0;
5114			return (0);
5115		}
5116		/* check for 32-bit match */
5117		if (*ptc && (iptc == -1) && NFS_DIR_COOKIE_SAME32(ndcc->cookies[i].key, cookie))
5118			iptc = i;
5119	}
5120	/* exact match not found */
5121	if (eofptc) {
5122		/* but 32-bit match hit the EOF cookie */
5123		nfs_node_unlock(dnp);
5124		OSAddAtomic64(1, &nfsstats.direofcache_hits);
5125		return (-1);
5126	}
5127	if (iptc >= 0) {
5128		/* but 32-bit match got a hit */
5129		*lbnp = ndcc->cookies[iptc].lbn;
5130		nfs_node_unlock(dnp);
5131		OSAddAtomic64(1, &nfsstats.direofcache_hits);
5132		return (0);
5133	}
5134	nfs_node_unlock(dnp);
5135
5136	/*
5137	 * No match found in the cookie cache... hmm...
5138	 * Let's search the directory's buffers for the cookie.
5139	 */
5140	nmp = NFSTONMP(dnp);
5141	if (!nmp)
5142		return (ENXIO);
5143	dpptc = NULL;
5144	found = 0;
5145
5146	lck_mtx_lock(nfs_buf_mutex);
5147	/*
5148	 * Scan the list of buffers, keeping them in order.
5149	 * Note that itercomplete inserts each of the remaining buffers
5150	 * into the head of list (thus reversing the elements).  So, we
5151	 * make sure to iterate through all buffers, inserting them after
5152	 * each other, to keep them in order.
5153	 * Also note: the LIST_INSERT_AFTER(lastbp) is only safe because
5154	 * we don't drop nfs_buf_mutex.
5155	 */
5156	if (!nfs_buf_iterprepare(dnp, &blist, NBI_CLEAN)) {
5157		lastbp = NULL;
5158		while ((bp = LIST_FIRST(&blist))) {
5159			LIST_REMOVE(bp, nb_vnbufs);
5160			if (!lastbp)
5161				LIST_INSERT_HEAD(&dnp->n_cleanblkhd, bp, nb_vnbufs);
5162			else
5163				LIST_INSERT_AFTER(lastbp, bp, nb_vnbufs);
5164			lastbp = bp;
5165			if (found)
5166				continue;
5167			nfs_buf_refget(bp);
5168			if (nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0)) {
5169				/* just skip this buffer */
5170				nfs_buf_refrele(bp);
5171				continue;
5172			}
5173			nfs_buf_refrele(bp);
5174
5175			/* scan the buffer for the cookie */
5176			ndbhp = (struct nfs_dir_buf_header*)bp->nb_data;
5177			dp = NFS_DIR_BUF_FIRST_DIRENTRY(bp);
5178			dpptc = NULL;
5179			for (i=0; (i < ndbhp->ndbh_count) && (cookie != dp->d_seekoff); i++) {
5180				if (*ptc && !dpptc && NFS_DIR_COOKIE_SAME32(cookie, dp->d_seekoff)) {
5181					dpptc = dp;
5182					iptc = i;
5183				}
5184				dp = NFS_DIRENTRY_NEXT(dp);
5185			}
5186			if ((i == ndbhp->ndbh_count) && dpptc) {
5187				/* found only a PTC match */
5188				dp = dpptc;
5189				i = iptc;
5190			} else if (i < ndbhp->ndbh_count) {
5191				*ptc = 0;
5192			}
5193			if (i < (ndbhp->ndbh_count-1)) {
5194				/* next entry is *in* this buffer: return this block */
5195				*lbnp = bp->nb_lblkno;
5196				found = 1;
5197			} else if (i == (ndbhp->ndbh_count-1)) {
5198				/* next entry refers to *next* buffer: return next block */
5199				*lbnp = dp->d_seekoff;
5200				found = 1;
5201			}
5202			nfs_buf_drop(bp);
5203		}
5204		nfs_buf_itercomplete(dnp, &blist, NBI_CLEAN);
5205	}
5206	lck_mtx_unlock(nfs_buf_mutex);
5207	if (found) {
5208		OSAddAtomic64(1, &nfsstats.direofcache_hits);
5209		return (0);
5210	}
5211
5212	/* still not found... oh well, just start a new block */
5213	*lbnp = cookie;
5214	OSAddAtomic64(1, &nfsstats.direofcache_misses);
5215	return (0);
5216}
5217
5218/*
5219 * scan a directory buffer for the given name
5220 * Returns: ESRCH if not found, ENOENT if found invalid, 0 if found
5221 * Note: should only be called with RDIRPLUS directory buffers
5222 */
5223
5224#define NDBS_PURGE	1
5225#define NDBS_UPDATE	2
5226
5227int
5228nfs_dir_buf_search(
5229	struct nfsbuf *bp,
5230	struct componentname *cnp,
5231	fhandle_t *fhp,
5232	struct nfs_vattr *nvap,
5233	uint64_t *xidp,
5234	time_t *attrstampp,
5235	daddr64_t *nextlbnp,
5236	int flags)
5237{
5238	struct direntry *dp;
5239	struct nfs_dir_buf_header *ndbhp;
5240	struct nfs_vattr *nvattrp;
5241	daddr64_t nextlbn = 0;
5242	int i, error = ESRCH, fhlen;
5243
5244	/* scan the buffer for the name */
5245	ndbhp = (struct nfs_dir_buf_header*)bp->nb_data;
5246	dp = NFS_DIR_BUF_FIRST_DIRENTRY(bp);
5247	for (i=0; i < ndbhp->ndbh_count; i++) {
5248		nextlbn = dp->d_seekoff;
5249		if ((cnp->cn_namelen == dp->d_namlen) && !strcmp(cnp->cn_nameptr, dp->d_name)) {
5250			fhlen = dp->d_name[dp->d_namlen+1];
5251			nvattrp = NFS_DIR_BUF_NVATTR(bp, i);
5252			if ((ndbhp->ndbh_ncgen != bp->nb_np->n_ncgen) || (fhp->fh_len == 0) ||
5253			    (nvattrp->nva_type == VNON) || (nvattrp->nva_fileid == 0)) {
5254				/* entry is not valid */
5255				error = ENOENT;
5256				break;
5257			}
5258			if (flags == NDBS_PURGE) {
5259				dp->d_fileno = 0;
5260				bzero(nvattrp, sizeof(*nvattrp));
5261				error = ENOENT;
5262				break;
5263			}
5264			if (flags == NDBS_UPDATE) {
5265				/* update direntry's attrs if fh matches */
5266				if ((fhp->fh_len == fhlen) && !bcmp(&dp->d_name[dp->d_namlen+2], fhp->fh_data, fhlen)) {
5267					bcopy(nvap, nvattrp, sizeof(*nvap));
5268					dp->d_fileno = nvattrp->nva_fileid;
5269					nvattrp->nva_fileid = *xidp;
5270					*(time_t*)(&dp->d_name[dp->d_namlen+2+fhp->fh_len]) = *attrstampp;
5271				}
5272				error = 0;
5273				break;
5274			}
5275			/* copy out fh, attrs, attrstamp, and xid */
5276			fhp->fh_len = fhlen;
5277			bcopy(&dp->d_name[dp->d_namlen+2], fhp->fh_data, MAX(fhp->fh_len, (int)sizeof(fhp->fh_data)));
5278			*attrstampp = *(time_t*)(&dp->d_name[dp->d_namlen+2+fhp->fh_len]);
5279			bcopy(nvattrp, nvap, sizeof(*nvap));
5280			*xidp = nvap->nva_fileid;
5281			nvap->nva_fileid = dp->d_fileno;
5282			error = 0;
5283			break;
5284		}
5285		dp = NFS_DIRENTRY_NEXT(dp);
5286	}
5287	if (nextlbnp)
5288		*nextlbnp = nextlbn;
5289	return (error);
5290}
5291
5292/*
5293 * Look up a name in a directory's buffers.
5294 * Note: should only be called with RDIRPLUS directory buffers
5295 */
5296int
5297nfs_dir_buf_cache_lookup(nfsnode_t dnp, nfsnode_t *npp, struct componentname *cnp, vfs_context_t ctx, int purge)
5298{
5299	nfsnode_t newnp;
5300	struct nfsmount *nmp;
5301	int error = 0, i, found = 0, count = 0;
5302	u_int64_t xid;
5303	struct nfs_vattr nvattr;
5304	fhandle_t fh;
5305	time_t attrstamp = 0;
5306	thread_t thd = vfs_context_thread(ctx);
5307	struct nfsbuf *bp, *lastbp, *foundbp;
5308	struct nfsbuflists blist;
5309	daddr64_t lbn, nextlbn;
5310	int dotunder = (cnp->cn_namelen > 2) && (cnp->cn_nameptr[0] == '.') && (cnp->cn_nameptr[1] == '_');
5311
5312	if (!(nmp = NFSTONMP(dnp)))
5313		return (ENXIO);
5314	if (!purge)
5315		*npp = NULL;
5316
5317	/* first check most recent buffer (and next one too) */
5318	lbn = dnp->n_lastdbl;
5319	for (i=0; i < 2; i++) {
5320		if ((error = nfs_buf_get(dnp, lbn, NFS_DIRBLKSIZ, thd, NBLK_READ|NBLK_ONLYVALID, &bp)))
5321			return (error);
5322		if (!bp)
5323			break;
5324		count++;
5325		error = nfs_dir_buf_search(bp, cnp, &fh, &nvattr, &xid, &attrstamp, &nextlbn, purge ? NDBS_PURGE : 0);
5326		nfs_buf_release(bp, 0);
5327		if (error == ESRCH) {
5328			error = 0;
5329		} else {
5330			found = 1;
5331			break;
5332		}
5333		lbn = nextlbn;
5334	}
5335
5336	lck_mtx_lock(nfs_buf_mutex);
5337	if (found) {
5338		dnp->n_lastdbl = lbn;
5339		goto done;
5340	}
5341
5342	/*
5343	 * Scan the list of buffers, keeping them in order.
5344	 * Note that itercomplete inserts each of the remaining buffers
5345	 * into the head of list (thus reversing the elements).  So, we
5346	 * make sure to iterate through all buffers, inserting them after
5347	 * each other, to keep them in order.
5348	 * Also note: the LIST_INSERT_AFTER(lastbp) is only safe because
5349	 * we don't drop nfs_buf_mutex.
5350	 */
5351	if (!nfs_buf_iterprepare(dnp, &blist, NBI_CLEAN)) {
5352		lastbp = foundbp = NULL;
5353		while ((bp = LIST_FIRST(&blist))) {
5354			LIST_REMOVE(bp, nb_vnbufs);
5355			if (!lastbp)
5356				LIST_INSERT_HEAD(&dnp->n_cleanblkhd, bp, nb_vnbufs);
5357			else
5358				LIST_INSERT_AFTER(lastbp, bp, nb_vnbufs);
5359			lastbp = bp;
5360			if (error || found)
5361				continue;
5362			if (!purge && dotunder && (count > 100)) /* don't waste too much time looking for ._ files */
5363				continue;
5364			nfs_buf_refget(bp);
5365			lbn = bp->nb_lblkno;
5366			if (nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0)) {
5367				/* just skip this buffer */
5368				nfs_buf_refrele(bp);
5369				continue;
5370			}
5371			nfs_buf_refrele(bp);
5372			count++;
5373			error = nfs_dir_buf_search(bp, cnp, &fh, &nvattr, &xid, &attrstamp, NULL, purge ? NDBS_PURGE : 0);
5374			if (error == ESRCH) {
5375				error = 0;
5376			} else {
5377				found = 1;
5378				foundbp = bp;
5379			}
5380			nfs_buf_drop(bp);
5381		}
5382		if (found) {
5383			LIST_REMOVE(foundbp, nb_vnbufs);
5384			LIST_INSERT_HEAD(&dnp->n_cleanblkhd, foundbp, nb_vnbufs);
5385			dnp->n_lastdbl = foundbp->nb_lblkno;
5386		}
5387		nfs_buf_itercomplete(dnp, &blist, NBI_CLEAN);
5388	}
5389done:
5390	lck_mtx_unlock(nfs_buf_mutex);
5391
5392	if (!error && found && !purge) {
5393		error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len,
5394				&nvattr, &xid, dnp->n_auth, NG_MAKEENTRY, &newnp);
5395		if (error)
5396			return (error);
5397		newnp->n_attrstamp = attrstamp;
5398		*npp = newnp;
5399		nfs_node_unlock(newnp);
5400		/* check if the dir buffer's attrs are out of date */
5401		if (!nfs_getattr(newnp, &nvattr, ctx, NGA_CACHED) &&
5402		    (newnp->n_attrstamp != attrstamp)) {
5403			/* they are, so update them */
5404			error = nfs_buf_get(dnp, lbn, NFS_DIRBLKSIZ, thd, NBLK_READ|NBLK_ONLYVALID, &bp);
5405			if (!error && bp) {
5406				attrstamp = newnp->n_attrstamp;
5407				xid = newnp->n_xid;
5408				nfs_dir_buf_search(bp, cnp, &fh, &nvattr, &xid, &attrstamp, NULL, NDBS_UPDATE);
5409				nfs_buf_release(bp, 0);
5410			}
5411			error = 0;
5412		}
5413	}
5414
5415	return (error);
5416}
5417
5418/*
5419 * Purge name cache entries for the given node.
5420 * For RDIRPLUS, also invalidate the entry in the directory's buffers.
5421 */
5422void
5423nfs_name_cache_purge(nfsnode_t dnp, nfsnode_t np, struct componentname *cnp, vfs_context_t ctx)
5424{
5425	struct nfsmount *nmp = NFSTONMP(dnp);
5426
5427	cache_purge(NFSTOV(np));
5428	if (nmp && (nmp->nm_vers > NFS_VER2) && NMFLAG(nmp, RDIRPLUS))
5429		nfs_dir_buf_cache_lookup(dnp, NULL, cnp, ctx, 1);
5430}
5431
5432/*
5433 * NFS V3 readdir (plus) RPC.
5434 */
5435int
5436nfs3_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx)
5437{
5438	struct nfsmount *nmp;
5439	int error = 0, lockerror, nfsvers, rdirplus, bigcookies;
5440	int i, status, attrflag, fhflag, more_entries = 1, eof, bp_dropped = 0;
5441	uint32_t nmreaddirsize, nmrsize;
5442	uint32_t namlen, skiplen, fhlen, xlen, attrlen, reclen, space_free, space_needed;
5443	uint64_t cookie, lastcookie, xid, savedxid, fileno;
5444	struct nfsm_chain nmreq, nmrep, nmrepsave;
5445	fhandle_t fh;
5446	struct nfs_vattr *nvattrp;
5447	struct nfs_dir_buf_header *ndbhp;
5448	struct direntry *dp;
5449	char *padstart, padlen;
5450	struct timeval now;
5451
5452	nmp = NFSTONMP(dnp);
5453	if (!nmp)
5454		return (ENXIO);
5455	nfsvers = nmp->nm_vers;
5456	nmreaddirsize = nmp->nm_readdirsize;
5457	nmrsize = nmp->nm_rsize;
5458	bigcookies = nmp->nm_state & NFSSTA_BIGCOOKIES;
5459noplus:
5460	rdirplus = ((nfsvers > NFS_VER2) && NMFLAG(nmp, RDIRPLUS)) ? 1 : 0;
5461
5462	if ((lockerror = nfs_node_lock(dnp)))
5463		return (lockerror);
5464
5465	/* determine cookie to use, and move dp to the right offset */
5466	ndbhp = (struct nfs_dir_buf_header*)bp->nb_data;
5467	dp = NFS_DIR_BUF_FIRST_DIRENTRY(bp);
5468	if (ndbhp->ndbh_count) {
5469		for (i=0; i < ndbhp->ndbh_count-1; i++)
5470			dp = NFS_DIRENTRY_NEXT(dp);
5471		cookie = dp->d_seekoff;
5472		dp = NFS_DIRENTRY_NEXT(dp);
5473	} else {
5474		cookie = bp->nb_lblkno;
5475		/* increment with every buffer read */
5476		OSAddAtomic64(1, &nfsstats.readdir_bios);
5477	}
5478	lastcookie = cookie;
5479
5480	/*
5481	 * Loop around doing readdir(plus) RPCs of size nm_readdirsize until
5482	 * the buffer is full (or we hit EOF).  Then put the remainder of the
5483	 * results in the next buffer(s).
5484	 */
5485	nfsm_chain_null(&nmreq);
5486	nfsm_chain_null(&nmrep);
5487	while (nfs_dir_buf_freespace(bp, rdirplus) && !(ndbhp->ndbh_flags & NDB_FULL)) {
5488		nfsm_chain_build_alloc_init(error, &nmreq,
5489			NFSX_FH(nfsvers) + NFSX_READDIR(nfsvers) + NFSX_UNSIGNED);
5490		nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize);
5491		if (nfsvers == NFS_VER3) {
5492			/* opaque values don't need swapping, but as long */
5493			/* as we are consistent about it, it should be ok */
5494			nfsm_chain_add_64(error, &nmreq, cookie);
5495			nfsm_chain_add_64(error, &nmreq, dnp->n_cookieverf);
5496		} else {
5497			nfsm_chain_add_32(error, &nmreq, cookie);
5498		}
5499		nfsm_chain_add_32(error, &nmreq, nmreaddirsize);
5500		if (rdirplus)
5501			nfsm_chain_add_32(error, &nmreq, nmrsize);
5502		nfsm_chain_build_done(error, &nmreq);
5503		nfs_node_unlock(dnp);
5504		lockerror = ENOENT;
5505		nfsmout_if(error);
5506
5507		error = nfs_request(dnp, NULL, &nmreq,
5508				rdirplus ? NFSPROC_READDIRPLUS : NFSPROC_READDIR,
5509				ctx, NULL, &nmrep, &xid, &status);
5510
5511		if ((lockerror = nfs_node_lock(dnp)))
5512			error = lockerror;
5513
5514		savedxid = xid;
5515		if (nfsvers == NFS_VER3)
5516			nfsm_chain_postop_attr_update(error, &nmrep, dnp, &xid);
5517		if (!error)
5518			error = status;
5519		if (nfsvers == NFS_VER3)
5520			nfsm_chain_get_64(error, &nmrep, dnp->n_cookieverf);
5521		nfsm_chain_get_32(error, &nmrep, more_entries);
5522
5523		if (!lockerror) {
5524			nfs_node_unlock(dnp);
5525			lockerror = ENOENT;
5526		}
5527		if (error == NFSERR_NOTSUPP) {
5528			/* oops... it doesn't look like readdirplus is supported */
5529			lck_mtx_lock(&nmp->nm_lock);
5530			NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_RDIRPLUS);
5531			lck_mtx_unlock(&nmp->nm_lock);
5532			goto noplus;
5533		}
5534		nfsmout_if(error);
5535
5536		if (rdirplus)
5537			microuptime(&now);
5538
5539		/* loop through the entries packing them into the buffer */
5540		while (more_entries) {
5541			if (nfsvers == NFS_VER3)
5542				nfsm_chain_get_64(error, &nmrep, fileno);
5543			else
5544				nfsm_chain_get_32(error, &nmrep, fileno);
5545			nfsm_chain_get_32(error, &nmrep, namlen);
5546			nfsmout_if(error);
5547			/* just truncate names that don't fit in direntry.d_name */
5548			if (namlen <= 0) {
5549				error = EBADRPC;
5550				goto nfsmout;
5551			}
5552			if (namlen > (sizeof(dp->d_name)-1)) {
5553				skiplen = namlen - sizeof(dp->d_name) + 1;
5554				namlen = sizeof(dp->d_name) - 1;
5555			} else {
5556				skiplen = 0;
5557			}
5558			/* guess that fh size will be same as parent */
5559			fhlen = rdirplus ? (1 + dnp->n_fhsize) : 0;
5560			xlen = rdirplus ? (fhlen + sizeof(time_t)) : 0;
5561			attrlen = rdirplus ? sizeof(struct nfs_vattr) : 0;
5562			reclen = NFS_DIRENTRY_LEN(namlen + xlen);
5563			space_needed = reclen + attrlen;
5564			space_free = nfs_dir_buf_freespace(bp, rdirplus);
5565			if (space_needed > space_free) {
5566				/*
5567				 * We still have entries to pack, but we've
5568				 * run out of room in the current buffer.
5569				 * So we need to move to the next buffer.
5570				 * The block# for the next buffer is the
5571				 * last cookie in the current buffer.
5572				 */
5573nextbuffer:
5574				ndbhp->ndbh_flags |= NDB_FULL;
5575				nfs_buf_release(bp, 0);
5576				bp_dropped = 1;
5577				bp = NULL;
5578				error = nfs_buf_get(dnp, lastcookie, NFS_DIRBLKSIZ, vfs_context_thread(ctx), NBLK_READ, &bp);
5579				nfsmout_if(error);
5580				/* initialize buffer */
5581				ndbhp = (struct nfs_dir_buf_header*)bp->nb_data;
5582				ndbhp->ndbh_flags = 0;
5583				ndbhp->ndbh_count = 0;
5584				ndbhp->ndbh_entry_end = sizeof(*ndbhp);
5585				ndbhp->ndbh_ncgen = dnp->n_ncgen;
5586				space_free = nfs_dir_buf_freespace(bp, rdirplus);
5587				dp = NFS_DIR_BUF_FIRST_DIRENTRY(bp);
5588				/* increment with every buffer read */
5589				OSAddAtomic64(1, &nfsstats.readdir_bios);
5590			}
5591			nmrepsave = nmrep;
5592			dp->d_fileno = fileno;
5593			dp->d_namlen = namlen;
5594			dp->d_reclen = reclen;
5595			dp->d_type = DT_UNKNOWN;
5596			nfsm_chain_get_opaque(error, &nmrep, namlen, dp->d_name);
5597			nfsmout_if(error);
5598			dp->d_name[namlen] = '\0';
5599			if (skiplen)
5600				nfsm_chain_adv(error, &nmrep,
5601					nfsm_rndup(namlen + skiplen) - nfsm_rndup(namlen));
5602			if (nfsvers == NFS_VER3)
5603				nfsm_chain_get_64(error, &nmrep, cookie);
5604			else
5605				nfsm_chain_get_32(error, &nmrep, cookie);
5606			nfsmout_if(error);
5607			dp->d_seekoff = cookie;
5608			if (!bigcookies && (cookie >> 32) && (nmp == NFSTONMP(dnp))) {
5609				/* we've got a big cookie, make sure flag is set */
5610				lck_mtx_lock(&nmp->nm_lock);
5611				nmp->nm_state |= NFSSTA_BIGCOOKIES;
5612				lck_mtx_unlock(&nmp->nm_lock);
5613				bigcookies = 1;
5614			}
5615			if (rdirplus) {
5616				nvattrp = NFS_DIR_BUF_NVATTR(bp, ndbhp->ndbh_count);
5617				/* check for attributes */
5618				nfsm_chain_get_32(error, &nmrep, attrflag);
5619				nfsmout_if(error);
5620				if (attrflag) {
5621					/* grab attributes */
5622					error = nfs_parsefattr(&nmrep, NFS_VER3, nvattrp);
5623					nfsmout_if(error);
5624					dp->d_type = IFTODT(VTTOIF(nvattrp->nva_type));
5625					/* fileid is already in d_fileno, so stash xid in attrs */
5626					nvattrp->nva_fileid = savedxid;
5627				} else {
5628					/* mark the attributes invalid */
5629					bzero(nvattrp, sizeof(struct nfs_vattr));
5630				}
5631				/* check for file handle */
5632				nfsm_chain_get_32(error, &nmrep, fhflag);
5633				nfsmout_if(error);
5634				if (fhflag) {
5635					nfsm_chain_get_fh(error, &nmrep, NFS_VER3, &fh);
5636					nfsmout_if(error);
5637					fhlen = fh.fh_len + 1;
5638					xlen = fhlen + sizeof(time_t);
5639					reclen = NFS_DIRENTRY_LEN(namlen + xlen);
5640					space_needed = reclen + attrlen;
5641					if (space_needed > space_free) {
5642						/* didn't actually have the room... move on to next buffer */
5643						nmrep = nmrepsave;
5644						goto nextbuffer;
5645					}
5646					/* pack the file handle into the record */
5647					dp->d_name[dp->d_namlen+1] = fh.fh_len;
5648					bcopy(fh.fh_data, &dp->d_name[dp->d_namlen+2], fh.fh_len);
5649				} else {
5650					/* mark the file handle invalid */
5651					fh.fh_len = 0;
5652					fhlen = fh.fh_len + 1;
5653					xlen = fhlen + sizeof(time_t);
5654					reclen = NFS_DIRENTRY_LEN(namlen + xlen);
5655					bzero(&dp->d_name[dp->d_namlen+1], fhlen);
5656				}
5657				*(time_t*)(&dp->d_name[dp->d_namlen+1+fhlen]) = now.tv_sec;
5658				dp->d_reclen = reclen;
5659			}
5660			padstart = dp->d_name + dp->d_namlen + 1 + xlen;
5661			ndbhp->ndbh_count++;
5662			lastcookie = cookie;
5663			/* advance to next direntry in buffer */
5664			dp = NFS_DIRENTRY_NEXT(dp);
5665			ndbhp->ndbh_entry_end = (char*)dp - bp->nb_data;
5666			/* zero out the pad bytes */
5667			padlen = (char*)dp - padstart;
5668			if (padlen > 0)
5669				bzero(padstart, padlen);
5670			/* check for more entries */
5671			nfsm_chain_get_32(error, &nmrep, more_entries);
5672			nfsmout_if(error);
5673		}
5674		/* Finally, get the eof boolean */
5675		nfsm_chain_get_32(error, &nmrep, eof);
5676		nfsmout_if(error);
5677		if (eof) {
5678			ndbhp->ndbh_flags |= (NDB_FULL|NDB_EOF);
5679			nfs_node_lock_force(dnp);
5680			dnp->n_eofcookie = lastcookie;
5681			nfs_node_unlock(dnp);
5682		} else {
5683			more_entries = 1;
5684		}
5685		if (bp_dropped) {
5686			nfs_buf_release(bp, 0);
5687			bp = NULL;
5688			break;
5689		}
5690		if ((lockerror = nfs_node_lock(dnp)))
5691			error = lockerror;
5692		nfsmout_if(error);
5693		nfsm_chain_cleanup(&nmrep);
5694		nfsm_chain_null(&nmreq);
5695	}
5696nfsmout:
5697	if (bp_dropped && bp)
5698		nfs_buf_release(bp, 0);
5699	if (!lockerror)
5700		nfs_node_unlock(dnp);
5701	nfsm_chain_cleanup(&nmreq);
5702	nfsm_chain_cleanup(&nmrep);
5703	return (bp_dropped ? NFSERR_DIRBUFDROPPED : error);
5704}
5705
5706/*
5707 * Silly rename. To make the NFS filesystem that is stateless look a little
5708 * more like the "ufs" a remove of an active vnode is translated to a rename
5709 * to a funny looking filename that is removed by nfs_vnop_inactive on the
5710 * nfsnode. There is the potential for another process on a different client
5711 * to create the same funny name between when the lookitup() fails and the
5712 * rename() completes, but...
5713 */
5714
5715/* format of "random" silly names - includes a number and pid */
5716/* (note: shouldn't exceed size of nfs_sillyrename.nsr_name) */
5717#define NFS_SILLYNAME_FORMAT ".nfs.%08x.%04x"
5718/* starting from zero isn't silly enough */
5719static uint32_t nfs_sillyrename_number = 0x20051025;
5720
5721int
5722nfs_sillyrename(
5723	nfsnode_t dnp,
5724	nfsnode_t np,
5725	struct componentname *cnp,
5726	vfs_context_t ctx)
5727{
5728	struct nfs_sillyrename *nsp;
5729	int error;
5730	short pid;
5731	kauth_cred_t cred;
5732	uint32_t num;
5733	struct nfsmount *nmp;
5734
5735	nmp = NFSTONMP(dnp);
5736	if (!nmp)
5737		return (ENXIO);
5738
5739	nfs_name_cache_purge(dnp, np, cnp, ctx);
5740
5741	MALLOC_ZONE(nsp, struct nfs_sillyrename *,
5742			sizeof (struct nfs_sillyrename), M_NFSREQ, M_WAITOK);
5743	if (!nsp)
5744		return (ENOMEM);
5745	cred = vfs_context_ucred(ctx);
5746	kauth_cred_ref(cred);
5747	nsp->nsr_cred = cred;
5748	nsp->nsr_dnp = dnp;
5749	error = vnode_ref(NFSTOV(dnp));
5750	if (error)
5751		goto bad_norele;
5752
5753	/* Fudge together a funny name */
5754	pid = vfs_context_pid(ctx);
5755	num = OSAddAtomic(1, &nfs_sillyrename_number);
5756	nsp->nsr_namlen = snprintf(nsp->nsr_name, sizeof(nsp->nsr_name),
5757				NFS_SILLYNAME_FORMAT, num, (pid & 0xffff));
5758	if (nsp->nsr_namlen >= (int)sizeof(nsp->nsr_name))
5759		nsp->nsr_namlen = sizeof(nsp->nsr_name) - 1;
5760
5761	/* Try lookitups until we get one that isn't there */
5762	while (nfs_lookitup(dnp, nsp->nsr_name, nsp->nsr_namlen, ctx, NULL) == 0) {
5763		num = OSAddAtomic(1, &nfs_sillyrename_number);
5764		nsp->nsr_namlen = snprintf(nsp->nsr_name, sizeof(nsp->nsr_name),
5765					NFS_SILLYNAME_FORMAT, num, (pid & 0xffff));
5766		if (nsp->nsr_namlen >= (int)sizeof(nsp->nsr_name))
5767			nsp->nsr_namlen = sizeof(nsp->nsr_name) - 1;
5768	}
5769
5770	/* now, do the rename */
5771	error = nmp->nm_funcs->nf_rename_rpc(dnp, cnp->cn_nameptr, cnp->cn_namelen,
5772					dnp, nsp->nsr_name, nsp->nsr_namlen, ctx);
5773
5774	/* Kludge: Map ENOENT => 0 assuming that it is a reply to a retry. */
5775	if (error == ENOENT)
5776		error = 0;
5777	if (!error) {
5778		nfs_node_lock_force(dnp);
5779		if (dnp->n_flag & NNEGNCENTRIES) {
5780			dnp->n_flag &= ~NNEGNCENTRIES;
5781			cache_purge_negatives(NFSTOV(dnp));
5782		}
5783		nfs_node_unlock(dnp);
5784	}
5785	FSDBG(267, dnp, np, num, error);
5786	if (error)
5787		goto bad;
5788	error = nfs_lookitup(dnp, nsp->nsr_name, nsp->nsr_namlen, ctx, &np);
5789	nfs_node_lock_force(np);
5790	np->n_sillyrename = nsp;
5791	nfs_node_unlock(np);
5792	return (0);
5793bad:
5794	vnode_rele(NFSTOV(dnp));
5795bad_norele:
5796	nsp->nsr_cred = NOCRED;
5797	kauth_cred_unref(&cred);
5798	FREE_ZONE(nsp, sizeof(*nsp), M_NFSREQ);
5799	return (error);
5800}
5801
5802int
5803nfs3_lookup_rpc_async(
5804	nfsnode_t dnp,
5805	char *name,
5806	int namelen,
5807	vfs_context_t ctx,
5808	struct nfsreq **reqp)
5809{
5810	struct nfsmount *nmp;
5811	struct nfsm_chain nmreq;
5812	int error = 0, nfsvers;
5813
5814	nmp = NFSTONMP(dnp);
5815	if (!nmp)
5816		return (ENXIO);
5817	nfsvers = nmp->nm_vers;
5818
5819	nfsm_chain_null(&nmreq);
5820
5821	nfsm_chain_build_alloc_init(error, &nmreq,
5822		NFSX_FH(nfsvers) + NFSX_UNSIGNED + nfsm_rndup(namelen));
5823	nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize);
5824	nfsm_chain_add_name(error, &nmreq, name, namelen, nmp);
5825	nfsm_chain_build_done(error, &nmreq);
5826	nfsmout_if(error);
5827	error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_LOOKUP,
5828			vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, reqp);
5829nfsmout:
5830	nfsm_chain_cleanup(&nmreq);
5831	return (error);
5832}
5833
5834int
5835nfs3_lookup_rpc_async_finish(
5836	nfsnode_t dnp,
5837	__unused char *name,
5838	__unused int namelen,
5839	vfs_context_t ctx,
5840	struct nfsreq *req,
5841	u_int64_t *xidp,
5842	fhandle_t *fhp,
5843	struct nfs_vattr *nvap)
5844{
5845	int error = 0, lockerror = ENOENT, status, nfsvers, attrflag;
5846	u_int64_t xid;
5847	struct nfsmount *nmp;
5848	struct nfsm_chain nmrep;
5849
5850	nmp = NFSTONMP(dnp);
5851	nfsvers = nmp->nm_vers;
5852
5853	nfsm_chain_null(&nmrep);
5854
5855	error = nfs_request_async_finish(req, &nmrep, xidp, &status);
5856
5857	if ((lockerror = nfs_node_lock(dnp)))
5858		error = lockerror;
5859	xid = *xidp;
5860	if (error || status) {
5861		if (nfsvers == NFS_VER3)
5862			nfsm_chain_postop_attr_update(error, &nmrep, dnp, &xid);
5863		if (!error)
5864			error = status;
5865		goto nfsmout;
5866	}
5867
5868	nfsmout_if(error || !fhp || !nvap);
5869
5870	/* get the file handle */
5871	nfsm_chain_get_fh(error, &nmrep, nfsvers, fhp);
5872
5873	/* get the attributes */
5874	if (nfsvers == NFS_VER3) {
5875		nfsm_chain_postop_attr_get(error, &nmrep, attrflag, nvap);
5876		nfsm_chain_postop_attr_update(error, &nmrep, dnp, &xid);
5877		if (!error && !attrflag)
5878			error = nfs3_getattr_rpc(NULL, NFSTOMP(dnp), fhp->fh_data, fhp->fh_len, 0, ctx, nvap, xidp);
5879	} else {
5880		error = nfs_parsefattr(&nmrep, nfsvers, nvap);
5881	}
5882nfsmout:
5883	if (!lockerror)
5884		nfs_node_unlock(dnp);
5885	nfsm_chain_cleanup(&nmrep);
5886	return (error);
5887}
5888
5889/*
5890 * Look up a file name and optionally either update the file handle or
5891 * allocate an nfsnode, depending on the value of npp.
5892 * npp == NULL	--> just do the lookup
5893 * *npp == NULL --> allocate a new nfsnode and make sure attributes are
5894 *			handled too
5895 * *npp != NULL --> update the file handle in the vnode
5896 */
5897int
5898nfs_lookitup(
5899	nfsnode_t dnp,
5900	char *name,
5901	int namelen,
5902	vfs_context_t ctx,
5903	nfsnode_t *npp)
5904{
5905	int error = 0;
5906	nfsnode_t np, newnp = NULL;
5907	u_int64_t xid;
5908	fhandle_t fh;
5909	struct nfsmount *nmp;
5910	struct nfs_vattr nvattr;
5911	struct nfsreq rq, *req = &rq;
5912
5913	nmp = NFSTONMP(dnp);
5914	if (!nmp)
5915		return (ENXIO);
5916
5917	if (NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_bitmap, NFS_FATTR_MAXNAME) &&
5918	    (namelen > (int)nmp->nm_fsattr.nfsa_maxname))
5919		return (ENAMETOOLONG);
5920
5921	NVATTR_INIT(&nvattr);
5922
5923	/* check for lookup of "." */
5924	if ((name[0] == '.') && (namelen == 1)) {
5925		/* skip lookup, we know who we are */
5926		fh.fh_len = 0;
5927		newnp = dnp;
5928		goto nfsmout;
5929	}
5930
5931	error = nmp->nm_funcs->nf_lookup_rpc_async(dnp, name, namelen, ctx, &req);
5932	nfsmout_if(error);
5933	error = nmp->nm_funcs->nf_lookup_rpc_async_finish(dnp, name, namelen, ctx, req, &xid, &fh, &nvattr);
5934	nfsmout_if(!npp || error);
5935
5936	if (*npp) {
5937		np = *npp;
5938		if (fh.fh_len != np->n_fhsize) {
5939			u_char *oldbuf = (np->n_fhsize > NFS_SMALLFH) ? np->n_fhp : NULL;
5940			if (fh.fh_len > NFS_SMALLFH) {
5941				MALLOC_ZONE(np->n_fhp, u_char *, fh.fh_len, M_NFSBIGFH, M_WAITOK);
5942				if (!np->n_fhp) {
5943				    np->n_fhp = oldbuf;
5944				    error = ENOMEM;
5945				    goto nfsmout;
5946				}
5947			} else {
5948				np->n_fhp = &np->n_fh[0];
5949			}
5950			if (oldbuf)
5951				FREE_ZONE(oldbuf, np->n_fhsize, M_NFSBIGFH);
5952		}
5953		bcopy(fh.fh_data, np->n_fhp, fh.fh_len);
5954		np->n_fhsize = fh.fh_len;
5955		nfs_node_lock_force(np);
5956		error = nfs_loadattrcache(np, &nvattr, &xid, 0);
5957		nfs_node_unlock(np);
5958		nfsmout_if(error);
5959		newnp = np;
5960	} else if (NFS_CMPFH(dnp, fh.fh_data, fh.fh_len)) {
5961		nfs_node_lock_force(dnp);
5962		if (dnp->n_xid <= xid)
5963			error = nfs_loadattrcache(dnp, &nvattr, &xid, 0);
5964		nfs_node_unlock(dnp);
5965		nfsmout_if(error);
5966		newnp = dnp;
5967	} else {
5968		struct componentname cn, *cnp = &cn;
5969		bzero(cnp, sizeof(*cnp));
5970		cnp->cn_nameptr = name;
5971		cnp->cn_namelen = namelen;
5972		error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len,
5973			    &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &np);
5974		nfsmout_if(error);
5975		newnp = np;
5976	}
5977
5978nfsmout:
5979	if (npp && !*npp && !error)
5980		*npp = newnp;
5981	NVATTR_CLEANUP(&nvattr);
5982	return (error);
5983}
5984
5985/*
5986 * set up and initialize a "._" file lookup structure used for
5987 * performing async lookups.
5988 */
5989void
5990nfs_dulookup_init(struct nfs_dulookup *dulp, nfsnode_t dnp, const char *name, int namelen, vfs_context_t ctx)
5991{
5992	int error, du_namelen;
5993	vnode_t du_vp;
5994	struct nfsmount *nmp = NFSTONMP(dnp);
5995
5996	/* check for ._ file in name cache */
5997	dulp->du_flags = 0;
5998	bzero(&dulp->du_cn, sizeof(dulp->du_cn));
5999	du_namelen = namelen + 2;
6000	if (!nmp || NMFLAG(nmp, NONEGNAMECACHE))
6001		return;
6002	if ((namelen >= 2) && (name[0] == '.') && (name[1] == '_'))
6003		return;
6004	if (du_namelen >= (int)sizeof(dulp->du_smallname))
6005		MALLOC(dulp->du_cn.cn_nameptr, char *, du_namelen + 1, M_TEMP, M_WAITOK);
6006	else
6007		dulp->du_cn.cn_nameptr = dulp->du_smallname;
6008	if (!dulp->du_cn.cn_nameptr)
6009		return;
6010	dulp->du_cn.cn_namelen = du_namelen;
6011	snprintf(dulp->du_cn.cn_nameptr, du_namelen + 1, "._%s", name);
6012	dulp->du_cn.cn_nameptr[du_namelen] = '\0';
6013	dulp->du_cn.cn_nameiop = LOOKUP;
6014	dulp->du_cn.cn_flags = MAKEENTRY;
6015
6016	error = cache_lookup(NFSTOV(dnp), &du_vp, &dulp->du_cn);
6017	if (error == -1) {
6018		vnode_put(du_vp);
6019	} else if (!error) {
6020		nmp = NFSTONMP(dnp);
6021		if (nmp && (nmp->nm_vers > NFS_VER2) && NMFLAG(nmp, RDIRPLUS)) {
6022			/* if rdirplus, try dir buf cache lookup */
6023			nfsnode_t du_np = NULL;
6024			if (!nfs_dir_buf_cache_lookup(dnp, &du_np, &dulp->du_cn, ctx, 0) && du_np) {
6025				/* dir buf cache hit */
6026				du_vp = NFSTOV(du_np);
6027				vnode_put(du_vp);
6028				error = -1;
6029			}
6030		}
6031		if (!error)
6032			dulp->du_flags |= NFS_DULOOKUP_DOIT;
6033	}
6034}
6035
6036/*
6037 * start an async "._" file lookup request
6038 */
6039void
6040nfs_dulookup_start(struct nfs_dulookup *dulp, nfsnode_t dnp, vfs_context_t ctx)
6041{
6042	struct nfsmount *nmp = NFSTONMP(dnp);
6043	struct nfsreq *req = &dulp->du_req;
6044
6045	if (!nmp || !(dulp->du_flags & NFS_DULOOKUP_DOIT) || (dulp->du_flags & NFS_DULOOKUP_INPROG))
6046		return;
6047	if (!nmp->nm_funcs->nf_lookup_rpc_async(dnp, dulp->du_cn.cn_nameptr,
6048			dulp->du_cn.cn_namelen, ctx, &req))
6049		dulp->du_flags |= NFS_DULOOKUP_INPROG;
6050}
6051
6052/*
6053 * finish an async "._" file lookup request and clean up the structure
6054 */
6055void
6056nfs_dulookup_finish(struct nfs_dulookup *dulp, nfsnode_t dnp, vfs_context_t ctx)
6057{
6058	struct nfsmount *nmp = NFSTONMP(dnp);
6059	int error;
6060	nfsnode_t du_np;
6061	u_int64_t xid;
6062	fhandle_t fh;
6063	struct nfs_vattr nvattr;
6064
6065	if (!nmp || !(dulp->du_flags & NFS_DULOOKUP_INPROG))
6066		goto out;
6067
6068	NVATTR_INIT(&nvattr);
6069	error = nmp->nm_funcs->nf_lookup_rpc_async_finish(dnp, dulp->du_cn.cn_nameptr,
6070			dulp->du_cn.cn_namelen, ctx, &dulp->du_req, &xid, &fh, &nvattr);
6071	dulp->du_flags &= ~NFS_DULOOKUP_INPROG;
6072	if (error == ENOENT) {
6073		/* add a negative entry in the name cache */
6074		nfs_node_lock_force(dnp);
6075		cache_enter(NFSTOV(dnp), NULL, &dulp->du_cn);
6076		dnp->n_flag |= NNEGNCENTRIES;
6077		nfs_node_unlock(dnp);
6078	} else if (!error) {
6079		error = nfs_nget(NFSTOMP(dnp), dnp, &dulp->du_cn, fh.fh_data, fh.fh_len,
6080			    &nvattr, &xid, dulp->du_req.r_auth, NG_MAKEENTRY, &du_np);
6081		if (!error) {
6082			nfs_node_unlock(du_np);
6083			vnode_put(NFSTOV(du_np));
6084		}
6085	}
6086	NVATTR_CLEANUP(&nvattr);
6087out:
6088	if (dulp->du_flags & NFS_DULOOKUP_INPROG)
6089		nfs_request_async_cancel(&dulp->du_req);
6090	if (dulp->du_cn.cn_nameptr && (dulp->du_cn.cn_nameptr != dulp->du_smallname))
6091		FREE(dulp->du_cn.cn_nameptr, M_TEMP);
6092}
6093
6094
6095/*
6096 * NFS Version 3 commit RPC
6097 */
6098int
6099nfs3_commit_rpc(
6100	nfsnode_t np,
6101	uint64_t offset,
6102	uint64_t count,
6103	kauth_cred_t cred,
6104	uint64_t wverf)
6105{
6106	struct nfsmount *nmp;
6107	int error = 0, lockerror, status, wccpostattr = 0, nfsvers;
6108	struct timespec premtime = { 0, 0 };
6109	u_int64_t xid, newwverf;
6110	uint32_t count32;
6111	struct nfsm_chain nmreq, nmrep;
6112
6113	nmp = NFSTONMP(np);
6114	FSDBG(521, np, offset, count, nmp ? nmp->nm_state : 0);
6115	if (!nmp)
6116		return (ENXIO);
6117	if (!(nmp->nm_state & NFSSTA_HASWRITEVERF))
6118		return (0);
6119	nfsvers = nmp->nm_vers;
6120
6121	if (count > UINT32_MAX)
6122		count32 = 0;
6123	else
6124		count32 = count;
6125
6126	nfsm_chain_null(&nmreq);
6127	nfsm_chain_null(&nmrep);
6128
6129	nfsm_chain_build_alloc_init(error, &nmreq, NFSX_FH(NFS_VER3));
6130	nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize);
6131	nfsm_chain_add_64(error, &nmreq, offset);
6132	nfsm_chain_add_32(error, &nmreq, count32);
6133	nfsm_chain_build_done(error, &nmreq);
6134	nfsmout_if(error);
6135	error = nfs_request2(np, NULL, &nmreq, NFSPROC_COMMIT,
6136			current_thread(), cred, NULL, 0, &nmrep, &xid, &status);
6137	if ((lockerror = nfs_node_lock(np)))
6138		error = lockerror;
6139	/* can we do anything useful with the wcc info? */
6140	nfsm_chain_get_wcc_data(error, &nmrep, np, &premtime, &wccpostattr, &xid);
6141	if (!lockerror)
6142		nfs_node_unlock(np);
6143	if (!error)
6144		error = status;
6145	nfsm_chain_get_64(error, &nmrep, newwverf);
6146	nfsmout_if(error);
6147	lck_mtx_lock(&nmp->nm_lock);
6148	if (nmp->nm_verf != newwverf)
6149		nmp->nm_verf = newwverf;
6150	if (wverf != newwverf)
6151		error = NFSERR_STALEWRITEVERF;
6152	lck_mtx_unlock(&nmp->nm_lock);
6153nfsmout:
6154	nfsm_chain_cleanup(&nmreq);
6155	nfsm_chain_cleanup(&nmrep);
6156	return (error);
6157}
6158
6159
6160int
6161nfs_vnop_blockmap(
6162	__unused struct vnop_blockmap_args /* {
6163		struct vnodeop_desc *a_desc;
6164		vnode_t a_vp;
6165		off_t a_foffset;
6166		size_t a_size;
6167		daddr64_t *a_bpn;
6168		size_t *a_run;
6169		void *a_poff;
6170		int a_flags;
6171	} */ *ap)
6172{
6173	return (ENOTSUP);
6174}
6175
6176
6177/*
6178 * fsync vnode op. Just call nfs_flush().
6179 */
6180/* ARGSUSED */
6181int
6182nfs_vnop_fsync(
6183	struct vnop_fsync_args /* {
6184		struct vnodeop_desc *a_desc;
6185		vnode_t a_vp;
6186		int a_waitfor;
6187		vfs_context_t a_context;
6188	} */ *ap)
6189{
6190	return (nfs_flush(VTONFS(ap->a_vp), ap->a_waitfor, vfs_context_thread(ap->a_context), 0));
6191}
6192
6193
6194/*
6195 * Do an NFS pathconf RPC.
6196 */
6197int
6198nfs3_pathconf_rpc(
6199	nfsnode_t np,
6200	struct nfs_fsattr *nfsap,
6201	vfs_context_t ctx)
6202{
6203	u_int64_t xid;
6204	int error = 0, lockerror, status, nfsvers;
6205	struct nfsm_chain nmreq, nmrep;
6206	struct nfsmount *nmp = NFSTONMP(np);
6207	uint32_t val = 0;
6208
6209	if (!nmp)
6210		return (ENXIO);
6211	nfsvers = nmp->nm_vers;
6212
6213	nfsm_chain_null(&nmreq);
6214	nfsm_chain_null(&nmrep);
6215
6216	/* fetch pathconf info from server */
6217	nfsm_chain_build_alloc_init(error, &nmreq, NFSX_FH(NFS_VER3));
6218	nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize);
6219	nfsm_chain_build_done(error, &nmreq);
6220	nfsmout_if(error);
6221	error = nfs_request(np, NULL, &nmreq, NFSPROC_PATHCONF, ctx, NULL, &nmrep, &xid, &status);
6222	if ((lockerror = nfs_node_lock(np)))
6223		error = lockerror;
6224	nfsm_chain_postop_attr_update(error, &nmrep, np, &xid);
6225	if (!lockerror)
6226		nfs_node_unlock(np);
6227	if (!error)
6228		error = status;
6229	nfsm_chain_get_32(error, &nmrep, nfsap->nfsa_maxlink);
6230	nfsm_chain_get_32(error, &nmrep, nfsap->nfsa_maxname);
6231	nfsm_chain_get_32(error, &nmrep, val);
6232	if (val)
6233		nfsap->nfsa_flags |= NFS_FSFLAG_NO_TRUNC;
6234	nfsm_chain_get_32(error, &nmrep, val);
6235	if (val)
6236		nfsap->nfsa_flags |= NFS_FSFLAG_CHOWN_RESTRICTED;
6237	nfsm_chain_get_32(error, &nmrep, val);
6238	if (val)
6239		nfsap->nfsa_flags |= NFS_FSFLAG_CASE_INSENSITIVE;
6240	nfsm_chain_get_32(error, &nmrep, val);
6241	if (val)
6242		nfsap->nfsa_flags |= NFS_FSFLAG_CASE_PRESERVING;
6243	NFS_BITMAP_SET(nfsap->nfsa_bitmap, NFS_FATTR_MAXLINK);
6244	NFS_BITMAP_SET(nfsap->nfsa_bitmap, NFS_FATTR_MAXNAME);
6245	NFS_BITMAP_SET(nfsap->nfsa_bitmap, NFS_FATTR_NO_TRUNC);
6246	NFS_BITMAP_SET(nfsap->nfsa_bitmap, NFS_FATTR_CHOWN_RESTRICTED);
6247	NFS_BITMAP_SET(nfsap->nfsa_bitmap, NFS_FATTR_CASE_INSENSITIVE);
6248	NFS_BITMAP_SET(nfsap->nfsa_bitmap, NFS_FATTR_CASE_PRESERVING);
6249nfsmout:
6250	nfsm_chain_cleanup(&nmreq);
6251	nfsm_chain_cleanup(&nmrep);
6252	return (error);
6253}
6254
6255/* save pathconf info for NFSv3 mount */
6256void
6257nfs3_pathconf_cache(struct nfsmount *nmp, struct nfs_fsattr *nfsap)
6258{
6259	nmp->nm_fsattr.nfsa_maxlink = nfsap->nfsa_maxlink;
6260	nmp->nm_fsattr.nfsa_maxname = nfsap->nfsa_maxname;
6261	nmp->nm_fsattr.nfsa_flags |= nfsap->nfsa_flags & NFS_FSFLAG_NO_TRUNC;
6262	nmp->nm_fsattr.nfsa_flags |= nfsap->nfsa_flags & NFS_FSFLAG_CHOWN_RESTRICTED;
6263	nmp->nm_fsattr.nfsa_flags |= nfsap->nfsa_flags & NFS_FSFLAG_CASE_INSENSITIVE;
6264	nmp->nm_fsattr.nfsa_flags |= nfsap->nfsa_flags & NFS_FSFLAG_CASE_PRESERVING;
6265	NFS_BITMAP_SET(nmp->nm_fsattr.nfsa_bitmap, NFS_FATTR_MAXLINK);
6266	NFS_BITMAP_SET(nmp->nm_fsattr.nfsa_bitmap, NFS_FATTR_MAXNAME);
6267	NFS_BITMAP_SET(nmp->nm_fsattr.nfsa_bitmap, NFS_FATTR_NO_TRUNC);
6268	NFS_BITMAP_SET(nmp->nm_fsattr.nfsa_bitmap, NFS_FATTR_CHOWN_RESTRICTED);
6269	NFS_BITMAP_SET(nmp->nm_fsattr.nfsa_bitmap, NFS_FATTR_CASE_INSENSITIVE);
6270	NFS_BITMAP_SET(nmp->nm_fsattr.nfsa_bitmap, NFS_FATTR_CASE_PRESERVING);
6271	nmp->nm_state |= NFSSTA_GOTPATHCONF;
6272}
6273
6274/*
6275 * Return POSIX pathconf information applicable to nfs.
6276 *
6277 * The NFS V2 protocol doesn't support this, so just return EINVAL
6278 * for V2.
6279 */
6280/* ARGSUSED */
6281int
6282nfs_vnop_pathconf(
6283	struct vnop_pathconf_args /* {
6284		struct vnodeop_desc *a_desc;
6285		vnode_t a_vp;
6286		int a_name;
6287		int32_t *a_retval;
6288		vfs_context_t a_context;
6289	} */ *ap)
6290{
6291	vnode_t vp = ap->a_vp;
6292	nfsnode_t np = VTONFS(vp);
6293	struct nfsmount *nmp;
6294	struct nfs_fsattr nfsa, *nfsap;
6295	int error = 0;
6296	uint64_t maxFileSize;
6297	uint nbits;
6298
6299	nmp = VTONMP(vp);
6300	if (!nmp)
6301		return (ENXIO);
6302
6303	switch (ap->a_name) {
6304	case _PC_LINK_MAX:
6305	case _PC_NAME_MAX:
6306	case _PC_CHOWN_RESTRICTED:
6307	case _PC_NO_TRUNC:
6308	case _PC_CASE_SENSITIVE:
6309	case _PC_CASE_PRESERVING:
6310		break;
6311	case _PC_FILESIZEBITS:
6312		if (nmp->nm_vers == NFS_VER2) {
6313			*ap->a_retval = 32;
6314			return (0);
6315		}
6316		break;
6317	case _PC_XATTR_SIZE_BITS:
6318		/* Do we support xattrs natively? */
6319		if (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR)
6320			break;  /* Yes */
6321		/* No... so just return an error */
6322		/* FALLTHROUGH */
6323	default:
6324		/* don't bother contacting the server if we know the answer */
6325		return (EINVAL);
6326	}
6327
6328	if (nmp->nm_vers == NFS_VER2)
6329		return (EINVAL);
6330
6331	lck_mtx_lock(&nmp->nm_lock);
6332	if (nmp->nm_vers == NFS_VER3) {
6333		if (!(nmp->nm_state & NFSSTA_GOTPATHCONF)) {
6334			/* no pathconf info cached */
6335			lck_mtx_unlock(&nmp->nm_lock);
6336			NFS_CLEAR_ATTRIBUTES(nfsa.nfsa_bitmap);
6337			error = nfs3_pathconf_rpc(np, &nfsa, ap->a_context);
6338			if (error)
6339				return (error);
6340			nmp = VTONMP(vp);
6341			if (!nmp)
6342				return (ENXIO);
6343			lck_mtx_lock(&nmp->nm_lock);
6344			if (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_HOMOGENEOUS) {
6345				/* all files have the same pathconf info, */
6346				/* so cache a copy of the results */
6347				nfs3_pathconf_cache(nmp, &nfsa);
6348			}
6349			nfsap = &nfsa;
6350		} else {
6351			nfsap = &nmp->nm_fsattr;
6352		}
6353	} else if (!(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_HOMOGENEOUS)) {
6354		/* no pathconf info cached */
6355		lck_mtx_unlock(&nmp->nm_lock);
6356		NFS_CLEAR_ATTRIBUTES(nfsa.nfsa_bitmap);
6357		error = nfs4_pathconf_rpc(np, &nfsa, ap->a_context);
6358		if (error)
6359			return (error);
6360		nmp = VTONMP(vp);
6361		if (!nmp)
6362			return (ENXIO);
6363		lck_mtx_lock(&nmp->nm_lock);
6364		nfsap = &nfsa;
6365	} else {
6366		nfsap = &nmp->nm_fsattr;
6367	}
6368
6369	switch (ap->a_name) {
6370	case _PC_LINK_MAX:
6371		if (NFS_BITMAP_ISSET(nfsap->nfsa_bitmap, NFS_FATTR_MAXLINK))
6372			*ap->a_retval = nfsap->nfsa_maxlink;
6373		else if ((nmp->nm_vers == NFS_VER4) && NFS_BITMAP_ISSET(np->n_vattr.nva_bitmap, NFS_FATTR_MAXLINK))
6374			*ap->a_retval = np->n_vattr.nva_maxlink;
6375		else
6376			error = EINVAL;
6377		break;
6378	case _PC_NAME_MAX:
6379		if (NFS_BITMAP_ISSET(nfsap->nfsa_bitmap, NFS_FATTR_MAXNAME))
6380			*ap->a_retval = nfsap->nfsa_maxname;
6381		else
6382			error = EINVAL;
6383		break;
6384	case _PC_CHOWN_RESTRICTED:
6385		if (NFS_BITMAP_ISSET(nfsap->nfsa_bitmap, NFS_FATTR_CHOWN_RESTRICTED))
6386			*ap->a_retval = (nfsap->nfsa_flags & NFS_FSFLAG_CHOWN_RESTRICTED) ? 200112 /* _POSIX_CHOWN_RESTRICTED */ : 0;
6387		else
6388			error = EINVAL;
6389		break;
6390	case _PC_NO_TRUNC:
6391		if (NFS_BITMAP_ISSET(nfsap->nfsa_bitmap, NFS_FATTR_NO_TRUNC))
6392			*ap->a_retval = (nfsap->nfsa_flags & NFS_FSFLAG_NO_TRUNC) ? 200112 /* _POSIX_NO_TRUNC */ : 0;
6393		else
6394			error = EINVAL;
6395		break;
6396	case _PC_CASE_SENSITIVE:
6397		if (NFS_BITMAP_ISSET(nfsap->nfsa_bitmap, NFS_FATTR_CASE_INSENSITIVE))
6398			*ap->a_retval = (nfsap->nfsa_flags & NFS_FSFLAG_CASE_INSENSITIVE) ? 0 : 1;
6399		else
6400			error = EINVAL;
6401		break;
6402	case _PC_CASE_PRESERVING:
6403		if (NFS_BITMAP_ISSET(nfsap->nfsa_bitmap, NFS_FATTR_CASE_PRESERVING))
6404			*ap->a_retval = (nfsap->nfsa_flags & NFS_FSFLAG_CASE_PRESERVING) ? 1 : 0;
6405		else
6406			error = EINVAL;
6407		break;
6408	case _PC_XATTR_SIZE_BITS: /* same as file size bits if named attrs supported */
6409	case _PC_FILESIZEBITS:
6410		if (!NFS_BITMAP_ISSET(nfsap->nfsa_bitmap, NFS_FATTR_MAXFILESIZE)) {
6411			*ap->a_retval = 64;
6412			error = 0;
6413			break;
6414		}
6415		maxFileSize = nfsap->nfsa_maxfilesize;
6416		nbits = 1;
6417		if (maxFileSize & 0xffffffff00000000ULL) {
6418			nbits += 32;
6419			maxFileSize >>= 32;
6420		}
6421		if (maxFileSize & 0xffff0000) {
6422			nbits += 16;
6423			maxFileSize >>= 16;
6424		}
6425		if (maxFileSize & 0xff00) {
6426			nbits += 8;
6427			maxFileSize >>= 8;
6428		}
6429		if (maxFileSize & 0xf0) {
6430			nbits += 4;
6431			maxFileSize >>= 4;
6432		}
6433		if (maxFileSize & 0xc) {
6434			nbits += 2;
6435			maxFileSize >>= 2;
6436		}
6437		if (maxFileSize & 0x2) {
6438			nbits += 1;
6439		}
6440		*ap->a_retval = nbits;
6441		break;
6442	default:
6443		error = EINVAL;
6444	}
6445
6446	lck_mtx_unlock(&nmp->nm_lock);
6447
6448	return (error);
6449}
6450
6451/*
6452 * Read wrapper for special devices.
6453 */
6454int
6455nfsspec_vnop_read(
6456	struct vnop_read_args /* {
6457		struct vnodeop_desc *a_desc;
6458		vnode_t a_vp;
6459		struct uio *a_uio;
6460		int a_ioflag;
6461		vfs_context_t a_context;
6462	} */ *ap)
6463{
6464	nfsnode_t np = VTONFS(ap->a_vp);
6465	struct timeval now;
6466	int error;
6467
6468	/*
6469	 * Set access flag.
6470	 */
6471	if ((error = nfs_node_lock(np)))
6472		return (error);
6473	np->n_flag |= NACC;
6474	microtime(&now);
6475	np->n_atim.tv_sec = now.tv_sec;
6476	np->n_atim.tv_nsec = now.tv_usec * 1000;
6477	nfs_node_unlock(np);
6478	return (VOCALL(spec_vnodeop_p, VOFFSET(vnop_read), ap));
6479}
6480
6481/*
6482 * Write wrapper for special devices.
6483 */
6484int
6485nfsspec_vnop_write(
6486	struct vnop_write_args /* {
6487		struct vnodeop_desc *a_desc;
6488		vnode_t a_vp;
6489		struct uio *a_uio;
6490		int a_ioflag;
6491		vfs_context_t a_context;
6492	} */ *ap)
6493{
6494	nfsnode_t np = VTONFS(ap->a_vp);
6495	struct timeval now;
6496	int error;
6497
6498	/*
6499	 * Set update flag.
6500	 */
6501	if ((error = nfs_node_lock(np)))
6502		return (error);
6503	np->n_flag |= NUPD;
6504	microtime(&now);
6505	np->n_mtim.tv_sec = now.tv_sec;
6506	np->n_mtim.tv_nsec = now.tv_usec * 1000;
6507	nfs_node_unlock(np);
6508	return (VOCALL(spec_vnodeop_p, VOFFSET(vnop_write), ap));
6509}
6510
6511/*
6512 * Close wrapper for special devices.
6513 *
6514 * Update the times on the nfsnode then do device close.
6515 */
6516int
6517nfsspec_vnop_close(
6518	struct vnop_close_args /* {
6519		struct vnodeop_desc *a_desc;
6520		vnode_t a_vp;
6521		int a_fflag;
6522		vfs_context_t a_context;
6523	} */ *ap)
6524{
6525	vnode_t vp = ap->a_vp;
6526	nfsnode_t np = VTONFS(vp);
6527	struct vnode_attr vattr;
6528	mount_t mp;
6529	int error;
6530
6531	if ((error = nfs_node_lock(np)))
6532		return (error);
6533	if (np->n_flag & (NACC | NUPD)) {
6534		np->n_flag |= NCHG;
6535		if (!vnode_isinuse(vp, 0) && (mp = vnode_mount(vp)) && !vfs_isrdonly(mp)) {
6536			VATTR_INIT(&vattr);
6537			if (np->n_flag & NACC) {
6538				vattr.va_access_time = np->n_atim;
6539				VATTR_SET_ACTIVE(&vattr, va_access_time);
6540			}
6541			if (np->n_flag & NUPD) {
6542				vattr.va_modify_time = np->n_mtim;
6543				VATTR_SET_ACTIVE(&vattr, va_modify_time);
6544			}
6545			nfs_node_unlock(np);
6546			vnode_setattr(vp, &vattr, ap->a_context);
6547		} else {
6548			nfs_node_unlock(np);
6549		}
6550	} else {
6551		nfs_node_unlock(np);
6552	}
6553	return (VOCALL(spec_vnodeop_p, VOFFSET(vnop_close), ap));
6554}
6555
6556#if FIFO
6557extern vnop_t **fifo_vnodeop_p;
6558
6559/*
6560 * Read wrapper for fifos.
6561 */
6562int
6563nfsfifo_vnop_read(
6564	struct vnop_read_args /* {
6565		struct vnodeop_desc *a_desc;
6566		vnode_t a_vp;
6567		struct uio *a_uio;
6568		int a_ioflag;
6569		vfs_context_t a_context;
6570	} */ *ap)
6571{
6572	nfsnode_t np = VTONFS(ap->a_vp);
6573	struct timeval now;
6574	int error;
6575
6576	/*
6577	 * Set access flag.
6578	 */
6579	if ((error = nfs_node_lock(np)))
6580		return (error);
6581	np->n_flag |= NACC;
6582	microtime(&now);
6583	np->n_atim.tv_sec = now.tv_sec;
6584	np->n_atim.tv_nsec = now.tv_usec * 1000;
6585	nfs_node_unlock(np);
6586	return (VOCALL(fifo_vnodeop_p, VOFFSET(vnop_read), ap));
6587}
6588
6589/*
6590 * Write wrapper for fifos.
6591 */
6592int
6593nfsfifo_vnop_write(
6594	struct vnop_write_args /* {
6595		struct vnodeop_desc *a_desc;
6596		vnode_t a_vp;
6597		struct uio *a_uio;
6598		int a_ioflag;
6599		vfs_context_t a_context;
6600	} */ *ap)
6601{
6602	nfsnode_t np = VTONFS(ap->a_vp);
6603	struct timeval now;
6604	int error;
6605
6606	/*
6607	 * Set update flag.
6608	 */
6609	if ((error = nfs_node_lock(np)))
6610		return (error);
6611	np->n_flag |= NUPD;
6612	microtime(&now);
6613	np->n_mtim.tv_sec = now.tv_sec;
6614	np->n_mtim.tv_nsec = now.tv_usec * 1000;
6615	nfs_node_unlock(np);
6616	return (VOCALL(fifo_vnodeop_p, VOFFSET(vnop_write), ap));
6617}
6618
6619/*
6620 * Close wrapper for fifos.
6621 *
6622 * Update the times on the nfsnode then do fifo close.
6623 */
6624int
6625nfsfifo_vnop_close(
6626	struct vnop_close_args /* {
6627		struct vnodeop_desc *a_desc;
6628		vnode_t a_vp;
6629		int a_fflag;
6630		vfs_context_t a_context;
6631	} */ *ap)
6632{
6633	vnode_t vp = ap->a_vp;
6634	nfsnode_t np = VTONFS(vp);
6635	struct vnode_attr vattr;
6636	struct timeval now;
6637	mount_t mp;
6638	int error;
6639
6640	if ((error = nfs_node_lock(np)))
6641		return (error);
6642	if (np->n_flag & (NACC | NUPD)) {
6643		microtime(&now);
6644		if (np->n_flag & NACC) {
6645			np->n_atim.tv_sec = now.tv_sec;
6646			np->n_atim.tv_nsec = now.tv_usec * 1000;
6647		}
6648		if (np->n_flag & NUPD) {
6649			np->n_mtim.tv_sec = now.tv_sec;
6650			np->n_mtim.tv_nsec = now.tv_usec * 1000;
6651		}
6652		np->n_flag |= NCHG;
6653		if (!vnode_isinuse(vp, 1) && (mp = vnode_mount(vp)) && !vfs_isrdonly(mp)) {
6654			VATTR_INIT(&vattr);
6655			if (np->n_flag & NACC) {
6656				vattr.va_access_time = np->n_atim;
6657				VATTR_SET_ACTIVE(&vattr, va_access_time);
6658			}
6659			if (np->n_flag & NUPD) {
6660				vattr.va_modify_time = np->n_mtim;
6661				VATTR_SET_ACTIVE(&vattr, va_modify_time);
6662			}
6663			nfs_node_unlock(np);
6664			vnode_setattr(vp, &vattr, ap->a_context);
6665		} else {
6666			nfs_node_unlock(np);
6667		}
6668	} else {
6669		nfs_node_unlock(np);
6670	}
6671	return (VOCALL(fifo_vnodeop_p, VOFFSET(vnop_close), ap));
6672}
6673#endif /* FIFO */
6674
6675/*ARGSUSED*/
6676int
6677nfs_vnop_ioctl(
6678	struct vnop_ioctl_args /* {
6679		struct vnodeop_desc *a_desc;
6680		vnode_t a_vp;
6681		u_int32_t a_command;
6682		caddr_t a_data;
6683		int a_fflag;
6684		vfs_context_t a_context;
6685	} */ *ap)
6686{
6687	vfs_context_t ctx = ap->a_context;
6688	vnode_t vp = ap->a_vp;
6689	struct nfsmount *mp = VTONMP(vp);
6690	int error = ENOTTY;
6691
6692	if (mp == NULL)
6693		return (ENXIO);
6694
6695	switch (ap->a_command) {
6696
6697	case F_FULLFSYNC:
6698		if (vnode_vfsisrdonly(vp))
6699			return (EROFS);
6700		error = nfs_flush(VTONFS(vp), MNT_WAIT, vfs_context_thread(ctx), 0);
6701		break;
6702	case NFS_FSCTL_DESTROY_CRED:
6703		error = nfs_gss_clnt_ctx_destroy(mp, vfs_context_ucred(ctx));
6704		break;
6705	}
6706
6707	return (error);
6708}
6709
6710/*ARGSUSED*/
6711int
6712nfs_vnop_select(
6713	__unused struct vnop_select_args /* {
6714		struct vnodeop_desc *a_desc;
6715		vnode_t a_vp;
6716		int a_which;
6717		int a_fflags;
6718		void *a_wql;
6719		vfs_context_t a_context;
6720	} */ *ap)
6721{
6722
6723	/*
6724	 * We were once bogusly seltrue() which returns 1.  Is this right?
6725	 */
6726	return (1);
6727}
6728
6729/*
6730 * vnode OP for pagein using UPL
6731 *
6732 * No buffer I/O, just RPCs straight into the mapped pages.
6733 */
6734int
6735nfs_vnop_pagein(
6736	struct vnop_pagein_args /* {
6737		struct vnodeop_desc *a_desc;
6738		vnode_t a_vp;
6739		upl_t a_pl;
6740		vm_offset_t a_pl_offset;
6741		off_t a_f_offset;
6742		size_t a_size;
6743		int a_flags;
6744		vfs_context_t a_context;
6745	} */ *ap)
6746{
6747	vnode_t vp = ap->a_vp;
6748	upl_t pl = ap->a_pl;
6749	size_t size = ap->a_size;
6750	off_t f_offset = ap->a_f_offset;
6751	vm_offset_t pl_offset = ap->a_pl_offset;
6752	int flags = ap->a_flags;
6753	thread_t thd;
6754	kauth_cred_t cred;
6755	nfsnode_t np = VTONFS(vp);
6756	size_t nmrsize, iosize, txsize, rxsize, retsize;
6757	off_t txoffset;
6758	struct nfsmount *nmp;
6759	int error = 0;
6760	vm_offset_t ioaddr, rxaddr;
6761	uio_t uio;
6762	char uio_buf [ UIO_SIZEOF(1) ];
6763	int nofreeupl = flags & UPL_NOCOMMIT;
6764	upl_page_info_t *plinfo;
6765#define MAXPAGINGREQS	16	/* max outstanding RPCs for pagein/pageout */
6766	struct nfsreq *req[MAXPAGINGREQS];
6767	int nextsend, nextwait;
6768	uint32_t stategenid = 0, restart = 0;
6769	kern_return_t kret;
6770
6771	FSDBG(322, np, f_offset, size, flags);
6772	if (pl == (upl_t)NULL)
6773		panic("nfs_pagein: no upl");
6774
6775	if (size <= 0) {
6776		printf("nfs_pagein: invalid size %ld", size);
6777		if (!nofreeupl)
6778			(void) ubc_upl_abort_range(pl, pl_offset, size, 0);
6779		return (EINVAL);
6780	}
6781	if (f_offset < 0 || f_offset >= (off_t)np->n_size || (f_offset & PAGE_MASK_64)) {
6782		if (!nofreeupl)
6783			ubc_upl_abort_range(pl, pl_offset, size,
6784				UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY);
6785		return (EINVAL);
6786	}
6787
6788	thd = vfs_context_thread(ap->a_context);
6789	cred = ubc_getcred(vp);
6790	if (!IS_VALID_CRED(cred))
6791		cred = vfs_context_ucred(ap->a_context);
6792
6793	uio = uio_createwithbuffer(1, f_offset, UIO_SYSSPACE, UIO_READ,
6794		&uio_buf, sizeof(uio_buf));
6795
6796	nmp = VTONMP(vp);
6797	if (!nmp) {
6798		if (!nofreeupl)
6799			ubc_upl_abort_range(pl, pl_offset, size,
6800				UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY);
6801		return (ENXIO);
6802	}
6803	nmrsize = nmp->nm_rsize;
6804
6805	plinfo = ubc_upl_pageinfo(pl);
6806	kret = ubc_upl_map(pl, &ioaddr);
6807	if (kret != KERN_SUCCESS)
6808		panic("nfs_vnop_pagein: ubc_upl_map() failed with (%d)", kret);
6809	ioaddr += pl_offset;
6810
6811tryagain:
6812	if (nmp->nm_vers >= NFS_VER4)
6813		stategenid = nmp->nm_stategenid;
6814	txsize = rxsize = size;
6815	txoffset = f_offset;
6816	rxaddr = ioaddr;
6817
6818	bzero(req, sizeof(req));
6819	nextsend = nextwait = 0;
6820	do {
6821		if (np->n_flag & NREVOKE) {
6822			error = EIO;
6823			break;
6824		}
6825		/* send requests while we need to and have available slots */
6826		while ((txsize > 0) && (req[nextsend] == NULL)) {
6827			iosize = MIN(nmrsize, txsize);
6828			if ((error = nmp->nm_funcs->nf_read_rpc_async(np, txoffset, iosize, thd, cred, NULL, &req[nextsend]))) {
6829				req[nextsend] = NULL;
6830				break;
6831			}
6832			txoffset += iosize;
6833			txsize -= iosize;
6834			nextsend = (nextsend + 1) % MAXPAGINGREQS;
6835		}
6836		/* wait while we need to and break out if more requests to send */
6837		while ((rxsize > 0) && req[nextwait]) {
6838			iosize = retsize = MIN(nmrsize, rxsize);
6839			uio_reset(uio, uio_offset(uio), UIO_SYSSPACE, UIO_READ);
6840			uio_addiov(uio, CAST_USER_ADDR_T(rxaddr), iosize);
6841			FSDBG(322, uio_offset(uio), uio_resid(uio), rxaddr, rxsize);
6842#if UPL_DEBUG
6843			upl_ubc_alias_set(pl, (uintptr_t) current_thread(), (uintptr_t) 2);
6844#endif /* UPL_DEBUG */
6845			OSAddAtomic64(1, &nfsstats.pageins);
6846			error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req[nextwait], uio, &retsize, NULL);
6847			req[nextwait] = NULL;
6848			nextwait = (nextwait + 1) % MAXPAGINGREQS;
6849			if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error)) {
6850				lck_mtx_lock(&nmp->nm_lock);
6851				if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid)) {
6852					NP(np, "nfs_vnop_pagein: error %d, initiating recovery", error);
6853					nfs_need_recover(nmp, error);
6854				}
6855				lck_mtx_unlock(&nmp->nm_lock);
6856				restart++;
6857				goto cancel;
6858			}
6859			if (error) {
6860				FSDBG(322, uio_offset(uio), uio_resid(uio), error, -1);
6861				break;
6862			}
6863			if (retsize < iosize) {
6864				/* Just zero fill the rest of the valid area. */
6865				int zcnt = iosize - retsize;
6866				bzero((char *)rxaddr + retsize, zcnt);
6867				FSDBG(324, uio_offset(uio), retsize, zcnt, rxaddr);
6868				uio_update(uio, zcnt);
6869			}
6870			rxaddr += iosize;
6871			rxsize -= iosize;
6872			if (txsize)
6873				break;
6874		}
6875	} while (!error && (txsize || rxsize));
6876
6877	restart = 0;
6878
6879	if (error) {
6880cancel:
6881		/* cancel any outstanding requests */
6882		while (req[nextwait]) {
6883			nfs_request_async_cancel(req[nextwait]);
6884			req[nextwait] = NULL;
6885			nextwait = (nextwait + 1) % MAXPAGINGREQS;
6886		}
6887		if (np->n_flag & NREVOKE) {
6888			error = EIO;
6889		} else if (restart) {
6890			if (restart <= nfs_mount_state_max_restarts(nmp)) { /* guard against no progress */
6891				if (error == NFSERR_GRACE)
6892					tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz);
6893				if (!(error = nfs_mount_state_wait_for_recovery(nmp)))
6894					goto tryagain;
6895			} else {
6896				NP(np, "nfs_pagein: too many restarts, aborting");
6897			}
6898		}
6899	}
6900
6901	ubc_upl_unmap(pl);
6902
6903	if (!nofreeupl) {
6904		if (error)
6905			ubc_upl_abort_range(pl, pl_offset, size,
6906					    UPL_ABORT_ERROR |
6907					    UPL_ABORT_FREE_ON_EMPTY);
6908		else
6909			ubc_upl_commit_range(pl, pl_offset, size,
6910					     UPL_COMMIT_CLEAR_DIRTY |
6911					     UPL_COMMIT_FREE_ON_EMPTY);
6912	}
6913	return (error);
6914}
6915
6916
6917/*
6918 * the following are needed only by nfs_pageout to know how to handle errors
6919 * see nfs_pageout comments on explanation of actions.
6920 * the errors here are copied from errno.h and errors returned by servers
6921 * are expected to match the same numbers here. If not, our actions maybe
6922 * erroneous.
6923 */
6924char nfs_pageouterrorhandler(int);
6925enum actiontype {NOACTION, DUMP, DUMPANDLOG, RETRY, SEVER};
6926#define NFS_ELAST 88
6927static u_char errorcount[NFS_ELAST+1]; /* better be zeros when initialized */
6928static const char errortooutcome[NFS_ELAST+1] = {
6929	NOACTION,
6930	DUMP,			/* EPERM	1	Operation not permitted */
6931	DUMP,			/* ENOENT	2	No such file or directory */
6932	DUMPANDLOG,		/* ESRCH	3	No such process */
6933	RETRY,			/* EINTR 	4	Interrupted system call */
6934	DUMP,			/* EIO		5	Input/output error */
6935	DUMP,			/* ENXIO	6	Device not configured */
6936	DUMPANDLOG,		/* E2BIG	7	Argument list too long */
6937	DUMPANDLOG,		/* ENOEXEC	8	Exec format error */
6938	DUMPANDLOG,		/* EBADF	9	Bad file descriptor */
6939	DUMPANDLOG,		/* ECHILD	10	No child processes */
6940	DUMPANDLOG,		/* EDEADLK	11	Resource deadlock avoided - was EAGAIN */
6941	RETRY,			/* ENOMEM	12	Cannot allocate memory */
6942	DUMP,			/* EACCES	13	Permission denied */
6943	DUMPANDLOG,		/* EFAULT	14	Bad address */
6944	DUMPANDLOG,		/* ENOTBLK	15	POSIX - Block device required */
6945	RETRY,			/* EBUSY	16	Device busy */
6946	DUMP,			/* EEXIST	17	File exists */
6947	DUMP,			/* EXDEV	18	Cross-device link */
6948	DUMP,			/* ENODEV	19	Operation not supported by device */
6949	DUMP,			/* ENOTDIR	20	Not a directory */
6950	DUMP,			/* EISDIR 	21	Is a directory */
6951	DUMP,			/* EINVAL	22	Invalid argument */
6952	DUMPANDLOG,		/* ENFILE	23	Too many open files in system */
6953	DUMPANDLOG,		/* EMFILE	24	Too many open files */
6954	DUMPANDLOG,		/* ENOTTY	25	Inappropriate ioctl for device */
6955	DUMPANDLOG,		/* ETXTBSY	26	Text file busy - POSIX */
6956	DUMP,			/* EFBIG	27	File too large */
6957	DUMP,			/* ENOSPC	28	No space left on device */
6958	DUMPANDLOG,		/* ESPIPE	29	Illegal seek */
6959	DUMP,			/* EROFS	30	Read-only file system */
6960	DUMP,			/* EMLINK	31	Too many links */
6961	RETRY,			/* EPIPE	32	Broken pipe */
6962	/* math software */
6963	DUMPANDLOG,		/* EDOM				33	Numerical argument out of domain */
6964	DUMPANDLOG,		/* ERANGE			34	Result too large */
6965	RETRY,			/* EAGAIN/EWOULDBLOCK	35	Resource temporarily unavailable */
6966	DUMPANDLOG,		/* EINPROGRESS		36	Operation now in progress */
6967	DUMPANDLOG,		/* EALREADY			37	Operation already in progress */
6968	/* ipc/network software -- argument errors */
6969	DUMPANDLOG,		/* ENOTSOC			38	Socket operation on non-socket */
6970	DUMPANDLOG,		/* EDESTADDRREQ		39	Destination address required */
6971	DUMPANDLOG,		/* EMSGSIZE			40	Message too long */
6972	DUMPANDLOG,		/* EPROTOTYPE		41	Protocol wrong type for socket */
6973	DUMPANDLOG,		/* ENOPROTOOPT		42	Protocol not available */
6974	DUMPANDLOG,		/* EPROTONOSUPPORT	43	Protocol not supported */
6975	DUMPANDLOG,		/* ESOCKTNOSUPPORT	44	Socket type not supported */
6976	DUMPANDLOG,		/* ENOTSUP			45	Operation not supported */
6977	DUMPANDLOG,		/* EPFNOSUPPORT		46	Protocol family not supported */
6978	DUMPANDLOG,		/* EAFNOSUPPORT		47	Address family not supported by protocol family */
6979	DUMPANDLOG,		/* EADDRINUSE		48	Address already in use */
6980	DUMPANDLOG,		/* EADDRNOTAVAIL	49	Can't assign requested address */
6981	/* ipc/network software -- operational errors */
6982	RETRY,			/* ENETDOWN			50	Network is down */
6983	RETRY,			/* ENETUNREACH		51	Network is unreachable */
6984	RETRY,			/* ENETRESET		52	Network dropped connection on reset */
6985	RETRY,			/* ECONNABORTED		53	Software caused connection abort */
6986	RETRY,			/* ECONNRESET		54	Connection reset by peer */
6987	RETRY,			/* ENOBUFS			55	No buffer space available */
6988	RETRY,			/* EISCONN			56	Socket is already connected */
6989	RETRY,			/* ENOTCONN			57	Socket is not connected */
6990	RETRY,			/* ESHUTDOWN		58	Can't send after socket shutdown */
6991	RETRY,			/* ETOOMANYREFS		59	Too many references: can't splice */
6992	RETRY,			/* ETIMEDOUT		60	Operation timed out */
6993	RETRY,			/* ECONNREFUSED		61	Connection refused */
6994
6995	DUMPANDLOG,		/* ELOOP			62	Too many levels of symbolic links */
6996	DUMP,			/* ENAMETOOLONG		63	File name too long */
6997	RETRY,			/* EHOSTDOWN		64	Host is down */
6998	RETRY,			/* EHOSTUNREACH		65	No route to host */
6999	DUMP,			/* ENOTEMPTY		66	Directory not empty */
7000	/* quotas & mush */
7001	DUMPANDLOG,		/* PROCLIM			67	Too many processes */
7002	DUMPANDLOG,		/* EUSERS			68	Too many users */
7003	DUMPANDLOG,		/* EDQUOT			69	Disc quota exceeded */
7004	/* Network File System */
7005	DUMP,			/* ESTALE			70	Stale NFS file handle */
7006	DUMP,			/* EREMOTE			71	Too many levels of remote in path */
7007	DUMPANDLOG,		/* EBADRPC			72	RPC struct is bad */
7008	DUMPANDLOG,		/* ERPCMISMATCH		73	RPC version wrong */
7009	DUMPANDLOG,		/* EPROGUNAVAIL		74	RPC prog. not avail */
7010	DUMPANDLOG,		/* EPROGMISMATCH	75	Program version wrong */
7011	DUMPANDLOG,		/* EPROCUNAVAIL		76	Bad procedure for program */
7012
7013	DUMPANDLOG,		/* ENOLCK			77	No locks available */
7014	DUMPANDLOG,		/* ENOSYS			78	Function not implemented */
7015	DUMPANDLOG,		/* EFTYPE			79	Inappropriate file type or format */
7016	DUMPANDLOG,		/* EAUTH			80	Authentication error */
7017	DUMPANDLOG,		/* ENEEDAUTH		81	Need authenticator */
7018	/* Intelligent device errors */
7019	DUMPANDLOG,		/* EPWROFF			82	Device power is off */
7020	DUMPANDLOG,		/* EDEVERR			83	Device error, e.g. paper out */
7021	DUMPANDLOG,		/* EOVERFLOW		84	Value too large to be stored in data type */
7022	/* Program loading errors */
7023	DUMPANDLOG,		/* EBADEXEC			85	Bad executable */
7024	DUMPANDLOG,		/* EBADARCH			86	Bad CPU type in executable */
7025	DUMPANDLOG,		/* ESHLIBVERS		87	Shared library version mismatch */
7026	DUMPANDLOG,		/* EBADMACHO		88	Malformed Macho file */
7027};
7028
7029char
7030nfs_pageouterrorhandler(int error)
7031{
7032	if (error > NFS_ELAST)
7033		return(DUMP);
7034	else
7035		return(errortooutcome[error]);
7036}
7037
7038
7039/*
7040 * vnode OP for pageout using UPL
7041 *
7042 * No buffer I/O, just RPCs straight from the mapped pages.
7043 * File size changes are not permitted in pageout.
7044 */
7045int
7046nfs_vnop_pageout(
7047	struct vnop_pageout_args /* {
7048		struct vnodeop_desc *a_desc;
7049		vnode_t a_vp;
7050		upl_t a_pl;
7051		vm_offset_t a_pl_offset;
7052		off_t a_f_offset;
7053		size_t a_size;
7054		int a_flags;
7055		vfs_context_t a_context;
7056	} */ *ap)
7057{
7058	vnode_t vp = ap->a_vp;
7059	upl_t pl = ap->a_pl;
7060	size_t size = ap->a_size;
7061	off_t f_offset = ap->a_f_offset;
7062	vm_offset_t pl_offset = ap->a_pl_offset;
7063	int flags = ap->a_flags;
7064	nfsnode_t np = VTONFS(vp);
7065	thread_t thd;
7066	kauth_cred_t cred;
7067	struct nfsbuf *bp;
7068	struct nfsmount *nmp = VTONMP(vp);
7069	daddr64_t lbn;
7070	int error = 0, iomode;
7071	off_t off, txoffset, rxoffset;
7072	vm_offset_t ioaddr, txaddr, rxaddr;
7073	uio_t auio;
7074	char uio_buf [ UIO_SIZEOF(1) ];
7075	int nofreeupl = flags & UPL_NOCOMMIT;
7076	size_t nmwsize, biosize, iosize, pgsize, txsize, rxsize, xsize, remsize;
7077	struct nfsreq *req[MAXPAGINGREQS];
7078	int nextsend, nextwait, wverfset, commit;
7079	uint64_t wverf, wverf2;
7080	uint32_t stategenid = 0, vrestart = 0, restart = 0, vrestarts = 0, restarts = 0;
7081	kern_return_t kret;
7082
7083	FSDBG(323, f_offset, size, pl, pl_offset);
7084
7085	if (pl == (upl_t)NULL)
7086		panic("nfs_pageout: no upl");
7087
7088	if (size <= 0) {
7089		printf("nfs_pageout: invalid size %ld", size);
7090		if (!nofreeupl)
7091			ubc_upl_abort_range(pl, pl_offset, size, 0);
7092		return (EINVAL);
7093	}
7094
7095	if (!nmp) {
7096		if (!nofreeupl)
7097			ubc_upl_abort(pl, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY);
7098		return (ENXIO);
7099	}
7100	biosize = nmp->nm_biosize;
7101	nmwsize = nmp->nm_wsize;
7102
7103	nfs_data_lock_noupdate(np, NFS_DATA_LOCK_SHARED);
7104
7105	/*
7106	 * Check to see whether the buffer is incore.
7107	 * If incore and not busy, invalidate it from the cache.
7108	 */
7109	for (iosize = 0; iosize < size; iosize += xsize) {
7110		off = f_offset + iosize;
7111		/* need make sure we do things on block boundaries */
7112		xsize = biosize - (off % biosize);
7113		if (off + xsize > f_offset + size)
7114			xsize = f_offset + size - off;
7115		lbn = (daddr64_t)(off / biosize);
7116		lck_mtx_lock(nfs_buf_mutex);
7117		if ((bp = nfs_buf_incore(np, lbn))) {
7118			FSDBG(323, off, bp, bp->nb_lflags, bp->nb_flags);
7119			if (nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0)) {
7120				lck_mtx_unlock(nfs_buf_mutex);
7121				nfs_data_unlock_noupdate(np);
7122				/* no panic. just tell vm we are busy */
7123				if (!nofreeupl)
7124					ubc_upl_abort_range(pl, pl_offset, size, 0);
7125				return (EBUSY);
7126			}
7127			if (bp->nb_dirtyend > 0) {
7128				/*
7129				 * if there's a dirty range in the buffer, check
7130				 * to see if it extends beyond the pageout region
7131				 *
7132				 * if the dirty region lies completely within the
7133				 * pageout region, we just invalidate the buffer
7134				 * because it's all being written out now anyway.
7135				 *
7136				 * if any of the dirty region lies outside the
7137				 * pageout region, we'll try to clip the dirty
7138				 * region to eliminate the portion that's being
7139				 * paged out.  If that's not possible, because
7140				 * the dirty region extends before and after the
7141				 * pageout region, then we'll just return EBUSY.
7142				 */
7143				off_t boff, start, end;
7144				boff = NBOFF(bp);
7145				start = off;
7146				end = off + xsize;
7147				/* clip end to EOF */
7148				if (end > (off_t)np->n_size)
7149					end = np->n_size;
7150				start -= boff;
7151				end -= boff;
7152				if ((bp->nb_dirtyoff < start) &&
7153				    (bp->nb_dirtyend > end)) {
7154				    /*
7155				     * not gonna be able to clip the dirty region
7156				     *
7157				     * But before returning the bad news, move the
7158				     * buffer to the start of the delwri list and
7159				     * give the list a push to try to flush the
7160				     * buffer out.
7161				     */
7162				    FSDBG(323, np, bp, 0xd00deebc, EBUSY);
7163				    nfs_buf_remfree(bp);
7164				    TAILQ_INSERT_HEAD(&nfsbufdelwri, bp, nb_free);
7165				    nfsbufdelwricnt++;
7166				    nfs_buf_drop(bp);
7167				    nfs_buf_delwri_push(1);
7168				    lck_mtx_unlock(nfs_buf_mutex);
7169				    nfs_data_unlock_noupdate(np);
7170				    if (!nofreeupl)
7171					    ubc_upl_abort_range(pl, pl_offset, size, 0);
7172				    return (EBUSY);
7173				}
7174				if ((bp->nb_dirtyoff < start) ||
7175				    (bp->nb_dirtyend > end)) {
7176				    /* clip dirty region, if necessary */
7177				    if (bp->nb_dirtyoff < start)
7178					bp->nb_dirtyend = min(bp->nb_dirtyend, start);
7179				    if (bp->nb_dirtyend > end)
7180					bp->nb_dirtyoff = max(bp->nb_dirtyoff, end);
7181				    FSDBG(323, bp, bp->nb_dirtyoff, bp->nb_dirtyend, 0xd00dee00);
7182				    /* we're leaving this block dirty */
7183				    nfs_buf_drop(bp);
7184				    lck_mtx_unlock(nfs_buf_mutex);
7185				    continue;
7186				}
7187			}
7188			nfs_buf_remfree(bp);
7189			lck_mtx_unlock(nfs_buf_mutex);
7190			SET(bp->nb_flags, NB_INVAL);
7191			nfs_node_lock_force(np);
7192			if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
7193				CLR(bp->nb_flags, NB_NEEDCOMMIT);
7194				np->n_needcommitcnt--;
7195				CHECK_NEEDCOMMITCNT(np);
7196			}
7197			nfs_node_unlock(np);
7198			nfs_buf_release(bp, 1);
7199		} else {
7200			lck_mtx_unlock(nfs_buf_mutex);
7201		}
7202	}
7203
7204	thd = vfs_context_thread(ap->a_context);
7205	cred = ubc_getcred(vp);
7206	if (!IS_VALID_CRED(cred))
7207		cred = vfs_context_ucred(ap->a_context);
7208
7209	nfs_node_lock_force(np);
7210	if (np->n_flag & NWRITEERR) {
7211		error = np->n_error;
7212		nfs_node_unlock(np);
7213		nfs_data_unlock_noupdate(np);
7214		if (!nofreeupl)
7215			ubc_upl_abort_range(pl, pl_offset, size,
7216					    UPL_ABORT_FREE_ON_EMPTY);
7217		return (error);
7218	}
7219	nfs_node_unlock(np);
7220
7221	if (f_offset < 0 || f_offset >= (off_t)np->n_size ||
7222	    f_offset & PAGE_MASK_64 || size & PAGE_MASK_64) {
7223		nfs_data_unlock_noupdate(np);
7224		if (!nofreeupl)
7225			ubc_upl_abort_range(pl, pl_offset, size,
7226					    UPL_ABORT_FREE_ON_EMPTY);
7227		return (EINVAL);
7228	}
7229
7230	kret = ubc_upl_map(pl, &ioaddr);
7231	if (kret != KERN_SUCCESS)
7232		panic("nfs_vnop_pageout: ubc_upl_map() failed with (%d)", kret);
7233	ioaddr += pl_offset;
7234
7235	if ((u_quad_t)f_offset + size > np->n_size)
7236		xsize = np->n_size - f_offset;
7237	else
7238		xsize = size;
7239
7240	pgsize = round_page_64(xsize);
7241	if ((size > pgsize) && !nofreeupl)
7242		ubc_upl_abort_range(pl, pl_offset + pgsize, size - pgsize,
7243				    UPL_ABORT_FREE_ON_EMPTY);
7244
7245	/*
7246	 * check for partial page and clear the
7247	 * contents past end of the file before
7248	 * releasing it in the VM page cache
7249	 */
7250	if ((u_quad_t)f_offset < np->n_size && (u_quad_t)f_offset + size > np->n_size) {
7251		size_t io = np->n_size - f_offset;
7252		bzero((caddr_t)(ioaddr + io), size - io);
7253		FSDBG(321, np->n_size, f_offset, f_offset + io, size - io);
7254	}
7255	nfs_data_unlock_noupdate(np);
7256
7257	auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE,
7258		&uio_buf, sizeof(uio_buf));
7259
7260tryagain:
7261	if (nmp->nm_vers >= NFS_VER4)
7262		stategenid = nmp->nm_stategenid;
7263	wverf = wverf2 = wverfset = 0;
7264	txsize = rxsize = xsize;
7265	txoffset = rxoffset = f_offset;
7266	txaddr = rxaddr = ioaddr;
7267	commit = NFS_WRITE_FILESYNC;
7268
7269	bzero(req, sizeof(req));
7270	nextsend = nextwait = 0;
7271	do {
7272		if (np->n_flag & NREVOKE) {
7273			error = EIO;
7274			break;
7275		}
7276		/* send requests while we need to and have available slots */
7277		while ((txsize > 0) && (req[nextsend] == NULL)) {
7278			iosize = MIN(nmwsize, txsize);
7279			uio_reset(auio, txoffset, UIO_SYSSPACE, UIO_WRITE);
7280			uio_addiov(auio, CAST_USER_ADDR_T(txaddr), iosize);
7281			FSDBG(323, uio_offset(auio), iosize, txaddr, txsize);
7282			OSAddAtomic64(1, &nfsstats.pageouts);
7283			nfs_node_lock_force(np);
7284			np->n_numoutput++;
7285			nfs_node_unlock(np);
7286			vnode_startwrite(vp);
7287			iomode = NFS_WRITE_UNSTABLE;
7288			if ((error = nmp->nm_funcs->nf_write_rpc_async(np, auio, iosize, thd, cred, iomode, NULL, &req[nextsend]))) {
7289				req[nextsend] = NULL;
7290				vnode_writedone(vp);
7291				nfs_node_lock_force(np);
7292				np->n_numoutput--;
7293				nfs_node_unlock(np);
7294				break;
7295			}
7296			txaddr += iosize;
7297			txoffset += iosize;
7298			txsize -= iosize;
7299			nextsend = (nextsend + 1) % MAXPAGINGREQS;
7300		}
7301		/* wait while we need to and break out if more requests to send */
7302		while ((rxsize > 0) && req[nextwait]) {
7303			iosize = remsize = MIN(nmwsize, rxsize);
7304			error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req[nextwait], &iomode, &iosize, &wverf2);
7305			req[nextwait] = NULL;
7306			nextwait = (nextwait + 1) % MAXPAGINGREQS;
7307			vnode_writedone(vp);
7308			nfs_node_lock_force(np);
7309			np->n_numoutput--;
7310			nfs_node_unlock(np);
7311			if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error)) {
7312				lck_mtx_lock(&nmp->nm_lock);
7313				if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid)) {
7314					NP(np, "nfs_vnop_pageout: error %d, initiating recovery", error);
7315					nfs_need_recover(nmp, error);
7316				}
7317				lck_mtx_unlock(&nmp->nm_lock);
7318				restart = 1;
7319				goto cancel;
7320			}
7321			if (error) {
7322				FSDBG(323, rxoffset, rxsize, error, -1);
7323				break;
7324			}
7325			if (!wverfset) {
7326				wverf = wverf2;
7327				wverfset = 1;
7328			} else if (wverf != wverf2) {
7329				/* verifier changed, so we need to restart all the writes */
7330				vrestart = 1;
7331				goto cancel;
7332			}
7333			/* Retain the lowest commitment level returned. */
7334			if (iomode < commit)
7335				commit = iomode;
7336			rxaddr += iosize;
7337			rxoffset += iosize;
7338			rxsize -= iosize;
7339			remsize -= iosize;
7340			if (remsize > 0) {
7341				/* need to try sending the remainder */
7342				iosize = remsize;
7343				uio_reset(auio, rxoffset, UIO_SYSSPACE, UIO_WRITE);
7344				uio_addiov(auio, CAST_USER_ADDR_T(rxaddr), remsize);
7345				iomode = NFS_WRITE_UNSTABLE;
7346				error = nfs_write_rpc2(np, auio, thd, cred, &iomode, &wverf2);
7347				if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error)) {
7348					NP(np, "nfs_vnop_pageout: restart: error %d", error);
7349					lck_mtx_lock(&nmp->nm_lock);
7350					if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid)) {
7351						NP(np, "nfs_vnop_pageout: error %d, initiating recovery", error);
7352						nfs_need_recover(nmp, error);
7353					}
7354					lck_mtx_unlock(&nmp->nm_lock);
7355					restart = 1;
7356					goto cancel;
7357				}
7358				if (error) {
7359					FSDBG(323, rxoffset, rxsize, error, -1);
7360					break;
7361				}
7362				if (wverf != wverf2) {
7363					/* verifier changed, so we need to restart all the writes */
7364					vrestart = 1;
7365					goto cancel;
7366				}
7367				if (iomode < commit)
7368					commit = iomode;
7369				rxaddr += iosize;
7370				rxoffset += iosize;
7371				rxsize -= iosize;
7372			}
7373			if (txsize)
7374				break;
7375		}
7376	} while (!error && (txsize || rxsize));
7377
7378	vrestart = 0;
7379
7380	if (!error && (commit != NFS_WRITE_FILESYNC)) {
7381		error = nmp->nm_funcs->nf_commit_rpc(np, f_offset, xsize, cred, wverf);
7382		if (error == NFSERR_STALEWRITEVERF) {
7383			vrestart = 1;
7384			error = EIO;
7385		}
7386	}
7387
7388	if (error) {
7389cancel:
7390		/* cancel any outstanding requests */
7391		while (req[nextwait]) {
7392			nfs_request_async_cancel(req[nextwait]);
7393			req[nextwait] = NULL;
7394			nextwait = (nextwait + 1) % MAXPAGINGREQS;
7395			vnode_writedone(vp);
7396			nfs_node_lock_force(np);
7397			np->n_numoutput--;
7398			nfs_node_unlock(np);
7399		}
7400		if (np->n_flag & NREVOKE) {
7401			error = EIO;
7402		} else {
7403			if (vrestart) {
7404				if (++vrestarts <= 100) /* guard against no progress */
7405					goto tryagain;
7406				NP(np, "nfs_pageout: too many restarts, aborting");
7407				FSDBG(323, f_offset, xsize, ERESTART, -1);
7408			}
7409			if (restart) {
7410				if (restarts <= nfs_mount_state_max_restarts(nmp)) { /* guard against no progress */
7411					if (error == NFSERR_GRACE)
7412						tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz);
7413					if (!(error = nfs_mount_state_wait_for_recovery(nmp)))
7414						goto tryagain;
7415				} else {
7416					NP(np, "nfs_pageout: too many restarts, aborting");
7417					FSDBG(323, f_offset, xsize, ERESTART, -1);
7418				}
7419			}
7420		}
7421	}
7422
7423	ubc_upl_unmap(pl);
7424
7425	/*
7426	 * We've had several different solutions on what to do when the pageout
7427	 * gets an error. If we don't handle it, and return an error to the
7428	 * caller, vm, it will retry . This can end in endless looping
7429	 * between vm and here doing retries of the same page. Doing a dump
7430	 * back to vm, will get it out of vm's knowledge and we lose whatever
7431	 * data existed. This is risky, but in some cases necessary. For
7432	 * example, the initial fix here was to do that for ESTALE. In that case
7433	 * the server is telling us that the file is no longer the same. We
7434	 * would not want to keep paging out to that. We also saw some 151
7435	 * errors from Auspex server and NFSv3 can return errors higher than
7436	 * ELAST. Those along with NFS known server errors we will "dump" from
7437	 * vm.  Errors we don't expect to occur, we dump and log for further
7438	 * analysis. Errors that could be transient, networking ones,
7439	 * we let vm "retry". Lastly, errors that we retry, but may have potential
7440	 * to storm the network, we "retrywithsleep". "sever" will be used in
7441	 * in the future to dump all pages of object for cases like ESTALE.
7442	 * All this is the basis for the states returned and first guesses on
7443	 * error handling. Tweaking expected as more statistics are gathered.
7444	 * Note, in the long run we may need another more robust solution to
7445	 * have some kind of persistant store when the vm cannot dump nor keep
7446	 * retrying as a solution, but this would be a file architectural change
7447	 */
7448	if (!nofreeupl) { /* otherwise stacked file system has to handle this */
7449		if (error) {
7450			int abortflags = 0;
7451			char action = nfs_pageouterrorhandler(error);
7452
7453			switch (action) {
7454				case DUMP:
7455					abortflags = UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY;
7456					break;
7457				case DUMPANDLOG:
7458					abortflags = UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY;
7459					if (error <= NFS_ELAST) {
7460						if ((errorcount[error] % 100) == 0)
7461							NP(np, "nfs_pageout: unexpected error %d. dumping vm page", error);
7462						errorcount[error]++;
7463					}
7464					break;
7465				case RETRY:
7466					abortflags = UPL_ABORT_FREE_ON_EMPTY;
7467					break;
7468				case SEVER: /* not implemented */
7469				default:
7470					NP(np, "nfs_pageout: action %d not expected", action);
7471					break;
7472			}
7473
7474			ubc_upl_abort_range(pl, pl_offset, pgsize, abortflags);
7475			/* return error in all cases above */
7476
7477		} else {
7478			ubc_upl_commit_range(pl, pl_offset, pgsize,
7479					     UPL_COMMIT_CLEAR_DIRTY |
7480					     UPL_COMMIT_FREE_ON_EMPTY);
7481		}
7482	}
7483	return (error);
7484}
7485
7486/* Blktooff derives file offset given a logical block number */
7487int
7488nfs_vnop_blktooff(
7489	struct vnop_blktooff_args /* {
7490		struct vnodeop_desc *a_desc;
7491		vnode_t a_vp;
7492		daddr64_t a_lblkno;
7493		off_t *a_offset;
7494	} */ *ap)
7495{
7496	int biosize;
7497	vnode_t vp = ap->a_vp;
7498	struct nfsmount *nmp = VTONMP(vp);
7499
7500	if (!nmp)
7501		return (ENXIO);
7502	biosize = nmp->nm_biosize;
7503
7504	*ap->a_offset = (off_t)(ap->a_lblkno * biosize);
7505
7506	return (0);
7507}
7508
7509int
7510nfs_vnop_offtoblk(
7511	struct vnop_offtoblk_args /* {
7512		struct vnodeop_desc *a_desc;
7513		vnode_t a_vp;
7514		off_t a_offset;
7515		daddr64_t *a_lblkno;
7516	} */ *ap)
7517{
7518	int biosize;
7519	vnode_t vp = ap->a_vp;
7520	struct nfsmount *nmp = VTONMP(vp);
7521
7522	if (!nmp)
7523		return (ENXIO);
7524	biosize = nmp->nm_biosize;
7525
7526	*ap->a_lblkno = (daddr64_t)(ap->a_offset / biosize);
7527
7528	return (0);
7529}
7530
7531/*
7532 * vnode change monitoring
7533 */
7534int
7535nfs_vnop_monitor(
7536	struct vnop_monitor_args /* {
7537		struct vnodeop_desc *a_desc;
7538		vnode_t a_vp;
7539		uint32_t a_events;
7540		uint32_t a_flags;
7541		void *a_handle;
7542		vfs_context_t a_context;
7543	} */ *ap)
7544{
7545	nfsnode_t np = VTONFS(ap->a_vp);
7546	struct nfsmount *nmp = VTONMP(ap->a_vp);
7547	int error = 0;
7548
7549	if (!nmp)
7550		return (ENXIO);
7551
7552	/* make sure that the vnode's monitoring status is up to date */
7553	lck_mtx_lock(&nmp->nm_lock);
7554	if (vnode_ismonitored(ap->a_vp)) {
7555		/* This vnode is currently being monitored, make sure we're tracking it. */
7556		if (np->n_monlink.le_next == NFSNOLIST) {
7557			LIST_INSERT_HEAD(&nmp->nm_monlist, np, n_monlink);
7558			nfs_mount_sock_thread_wake(nmp);
7559		}
7560	} else {
7561		/* This vnode is no longer being monitored, make sure we're not tracking it. */
7562		/* Wait for any in-progress getattr to complete first. */
7563		while (np->n_mflag & NMMONSCANINPROG) {
7564			struct timespec ts = { 1, 0 };
7565			np->n_mflag |= NMMONSCANWANT;
7566			msleep(&np->n_mflag, &nmp->nm_lock, PZERO-1, "nfswaitmonscan", &ts);
7567		}
7568		if (np->n_monlink.le_next != NFSNOLIST) {
7569			LIST_REMOVE(np, n_monlink);
7570			np->n_monlink.le_next = NFSNOLIST;
7571		}
7572	}
7573	lck_mtx_unlock(&nmp->nm_lock);
7574
7575	return (error);
7576}
7577
7578/*
7579 * Send a vnode notification for the given events.
7580 */
7581void
7582nfs_vnode_notify(nfsnode_t np, uint32_t events)
7583{
7584	struct nfsmount *nmp = NFSTONMP(np);
7585	struct nfs_vattr nvattr;
7586	struct vnode_attr vattr, *vap = NULL;
7587	struct timeval now;
7588
7589	microuptime(&now);
7590	if ((np->n_evtstamp == now.tv_sec) || !nmp) {
7591		/* delay sending this notify */
7592		np->n_events |= events;
7593		return;
7594	}
7595	events |= np->n_events;
7596	np->n_events = 0;
7597	np->n_evtstamp = now.tv_sec;
7598
7599	vfs_get_notify_attributes(&vattr);
7600	if (!nfs_getattrcache(np, &nvattr, 0)) {
7601		vap = &vattr;
7602		VATTR_INIT(vap);
7603		VATTR_RETURN(vap, va_fsid, vfs_statfs(nmp->nm_mountp)->f_fsid.val[0]);
7604		VATTR_RETURN(vap, va_fileid, nvattr.nva_fileid);
7605		VATTR_RETURN(vap, va_mode, nvattr.nva_mode);
7606		VATTR_RETURN(vap, va_uid, nvattr.nva_uid);
7607		VATTR_RETURN(vap, va_gid, nvattr.nva_gid);
7608		VATTR_RETURN(vap, va_nlink, nvattr.nva_nlink);
7609	}
7610	vnode_notify(NFSTOV(np), events, vap);
7611}
7612