1/*
2 * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1989, 1993
31 *	The Regents of the University of California.  All rights reserved.
32 *
33 * This code is derived from software contributed to Berkeley by
34 * Rick Macklem at The University of Guelph.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 *    notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 *    notice, this list of conditions and the following disclaimer in the
43 *    documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 *    must display the following acknowledgement:
46 *	This product includes software developed by the University of
47 *	California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 *    may be used to endorse or promote products derived from this software
50 *    without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
65 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
66 */
67#include <sys/param.h>
68#include <sys/systm.h>
69#include <sys/resourcevar.h>
70#include <sys/signalvar.h>
71#include <sys/proc_internal.h>
72#include <sys/kauth.h>
73#include <sys/malloc.h>
74#include <sys/vnode.h>
75#include <sys/dirent.h>
76#include <sys/mount_internal.h>
77#include <sys/kernel.h>
78#include <sys/ubc_internal.h>
79#include <sys/uio_internal.h>
80#include <sys/kpi_mbuf.h>
81
82#include <sys/vm.h>
83#include <sys/vmparam.h>
84
85#include <sys/time.h>
86#include <kern/clock.h>
87#include <libkern/OSAtomic.h>
88#include <kern/kalloc.h>
89#include <kern/thread_call.h>
90
91#include <nfs/rpcv2.h>
92#include <nfs/nfsproto.h>
93#include <nfs/nfs.h>
94#include <nfs/nfs_gss.h>
95#include <nfs/nfsmount.h>
96#include <nfs/nfsnode.h>
97#include <sys/buf_internal.h>
98#include <libkern/OSAtomic.h>
99
100kern_return_t	thread_terminate(thread_t); /* XXX */
101
102#define	NFSBUFHASH(np, lbn)	\
103	(&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
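/* nfsbuf hash table: buffers are hashed by nfsnode pointer and logical block number */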
104LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl;
105struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
106u_long nfsbufhash;
107int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
108int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
109int nfs_nbdwrite;
110int nfs_buf_timer_on = 0;
111thread_t nfsbufdelwrithd = NULL;
112
113lck_grp_t *nfs_buf_lck_grp;
114lck_mtx_t *nfs_buf_mutex;
115
116#define NFSBUF_FREE_PERIOD	30	/* seconds */
117#define NFSBUF_LRU_STALE	120
118#define NFSBUF_META_STALE	240
119
120/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
121#define LRU_TO_FREEUP			6
122/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
123#define META_TO_FREEUP			3
124/* total number of nfsbufs nfs_buf_freeup() should attempt to free */
125#define TOTAL_TO_FREEUP			(LRU_TO_FREEUP+META_TO_FREEUP)
126/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from timer */
127#define LRU_FREEUP_FRAC_ON_TIMER	8
128/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from timer */
129#define META_FREEUP_FRAC_ON_TIMER	16
130/* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
131#define LRU_FREEUP_MIN_FRAC		4
132/* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
133#define META_FREEUP_MIN_FRAC		2
134
135#define NFS_BUF_FREEUP() \
136	do { \
137		/* only call nfs_buf_freeup() if it has work to do: */ \
138		if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
139		     (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
140		    ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
141			nfs_buf_freeup(0); \
142	} while (0)
143
144/*
145 * Initialize nfsbuf globals: locks, counters, the buffer hash table, and free lists
146 */
147void
148nfs_nbinit(void)
149{
150	nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", LCK_GRP_ATTR_NULL);
151	nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL);
152
153	nfsbufcnt = nfsbufmetacnt =
154	nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
155	nfsbufmin = 128;
156	/* size nfsbufmax to cover at most half sane_size (w/default buf size) */
157	nfsbufmax = (sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT));
158	nfsbufmetamax = nfsbufmax / 4;
159	nfsneedbuffer = 0;
160	nfs_nbdwrite = 0;
161
162	nfsbufhashtbl = hashinit(nfsbufmax/4, M_TEMP, &nfsbufhash);
163	TAILQ_INIT(&nfsbuffree);
164	TAILQ_INIT(&nfsbuffreemeta);
165	TAILQ_INIT(&nfsbufdelwri);
166
167}
168
169/*
170 * Check periodically for stale/unused nfs bufs
171 */
172void
173nfs_buf_timer(__unused void *param0, __unused void *param1)
174{
175	nfs_buf_freeup(1);
176
177	lck_mtx_lock(nfs_buf_mutex);
178	if (nfsbufcnt <= nfsbufmin) {
179		nfs_buf_timer_on = 0;
180		lck_mtx_unlock(nfs_buf_mutex);
181		return;
182	}
183	lck_mtx_unlock(nfs_buf_mutex);
184
185	nfs_interval_timer_start(nfs_buf_timer_call,
186		NFSBUF_FREE_PERIOD * 1000);
187}
188
189/*
190 * try to free up some excess, unused nfsbufs
191 */
192void
193nfs_buf_freeup(int timer)
194{
195	struct nfsbuf *fbp;
196	struct timeval now;
197	int count;
198	struct nfsbuffreehead nfsbuffreeup;
199
200	TAILQ_INIT(&nfsbuffreeup);
201
202	lck_mtx_lock(nfs_buf_mutex);
203
204	microuptime(&now);
205
206	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
207
208	count = timer ? nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
209	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
210		fbp = TAILQ_FIRST(&nfsbuffree);
211		if (!fbp)
212			break;
213		if (fbp->nb_refs)
214			break;
215		if (NBUFSTAMPVALID(fbp) &&
216		    (fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec)
217			break;
218		nfs_buf_remfree(fbp);
219		/* disassociate buffer from any nfsnode */
220		if (fbp->nb_np) {
221			if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
222				LIST_REMOVE(fbp, nb_vnbufs);
223				fbp->nb_vnbufs.le_next = NFSNOLIST;
224			}
225			fbp->nb_np = NULL;
226		}
227		LIST_REMOVE(fbp, nb_hash);
228		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
229		nfsbufcnt--;
230	}
231
232	count = timer ? nfsbuffreemetacnt/META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
233	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
234		fbp = TAILQ_FIRST(&nfsbuffreemeta);
235		if (!fbp)
236			break;
237		if (fbp->nb_refs)
238			break;
239		if (NBUFSTAMPVALID(fbp) &&
240		    (fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec)
241			break;
242		nfs_buf_remfree(fbp);
243		/* disassociate buffer from any nfsnode */
244		if (fbp->nb_np) {
245			if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
246				LIST_REMOVE(fbp, nb_vnbufs);
247				fbp->nb_vnbufs.le_next = NFSNOLIST;
248			}
249			fbp->nb_np = NULL;
250		}
251		LIST_REMOVE(fbp, nb_hash);
252		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
253		nfsbufcnt--;
254		nfsbufmetacnt--;
255	}
256
257	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
258	NFSBUFCNTCHK();
259
260	lck_mtx_unlock(nfs_buf_mutex);
261
262	while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
263		TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
264		/* nuke any creds */
265		if (IS_VALID_CRED(fbp->nb_rcred))
266			kauth_cred_unref(&fbp->nb_rcred);
267		if (IS_VALID_CRED(fbp->nb_wcred))
268			kauth_cred_unref(&fbp->nb_wcred);
269		/* if buf was NB_META, dump buffer */
270		if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data)
271			kfree(fbp->nb_data, fbp->nb_bufsize);
272		FREE(fbp, M_TEMP);
273	}
274
275}
276
277/*
278 * remove a buffer from the freelist
279 * (must be called with nfs_buf_mutex held)
280 */
281void
282nfs_buf_remfree(struct nfsbuf *bp)
283{
284	if (bp->nb_free.tqe_next == NFSNOLIST)
285		panic("nfsbuf not on free list");
286	if (ISSET(bp->nb_flags, NB_DELWRI)) {
287		nfsbufdelwricnt--;
288		TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
289	} else if (ISSET(bp->nb_flags, NB_META)) {
290		nfsbuffreemetacnt--;
291		TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
292	} else {
293		nfsbuffreecnt--;
294		TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
295	}
296	bp->nb_free.tqe_next = NFSNOLIST;
297	NFSBUFCNTCHK();
298}
299
300/*
301 * check for existence of nfsbuf in cache
302 */
303boolean_t
304nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno)
305{
306	boolean_t rv;
307	lck_mtx_lock(nfs_buf_mutex);
308	if (nfs_buf_incore(np, blkno))
309		rv = TRUE;
310	else
311		rv = FALSE;
312	lck_mtx_unlock(nfs_buf_mutex);
313	return (rv);
314}
315
316/*
317 * return incore buffer (must be called with nfs_buf_mutex held)
318 */
319struct nfsbuf *
320nfs_buf_incore(nfsnode_t np, daddr64_t blkno)
321{
322	/* Search hash chain */
323	struct nfsbuf * bp = NFSBUFHASH(np, blkno)->lh_first;
324	for (; bp != NULL; bp = bp->nb_hash.le_next)
325		if ((bp->nb_lblkno == blkno) && (bp->nb_np == np)) {
326			if (!ISSET(bp->nb_flags, NB_INVAL)) {
327				FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_np);
328				return (bp);
329			}
330		}
331	return (NULL);
332}
333
334/*
335 * Check if it's OK to drop a page.
336 *
337 * Called by vnode_pager() on pageout request of non-dirty page.
338 * We need to make sure that it's not part of a delayed write.
339 * If it is, we can't let the VM drop it because we may need it
340 * later when/if we need to write the data (again).
341 */
342int
343nfs_buf_page_inval(vnode_t vp, off_t offset)
344{
345	struct nfsmount *nmp = VTONMP(vp);
346	struct nfsbuf *bp;
347	int error = 0;
348
349	if (!nmp)
350		return (ENXIO);
351
352	lck_mtx_lock(nfs_buf_mutex);
353	bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / nmp->nm_biosize));
354	if (!bp)
355		goto out;
356	FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
357	if (ISSET(bp->nb_lflags, NBL_BUSY)) {
358		error = EBUSY;
359		goto out;
360	}
361	/*
362	 * If there's a dirty range in the buffer, check to
363	 * see if this page intersects with the dirty range.
364	 * If it does, we can't let the pager drop the page.
365	 */
366	if (bp->nb_dirtyend > 0) {
367		int start = offset - NBOFF(bp);
368		if ((bp->nb_dirtyend > start) &&
369		    (bp->nb_dirtyoff < (start + PAGE_SIZE))) {
370			/*
371			 * Before returning the bad news, move the
372			 * buffer to the start of the delwri list and
373			 * give the list a push to try to flush the
374			 * buffer out.
375			 */
376			error = EBUSY;
377			nfs_buf_remfree(bp);
378			TAILQ_INSERT_HEAD(&nfsbufdelwri, bp, nb_free);
379			nfsbufdelwricnt++;
380			nfs_buf_delwri_push(1);
381		}
382	}
383out:
384	lck_mtx_unlock(nfs_buf_mutex);
385	return (error);
386}
387
388/*
389 * set up the UPL for a buffer
390 * (must NOT be called with nfs_buf_mutex held)
391 */
392int
393nfs_buf_upl_setup(struct nfsbuf *bp)
394{
395	kern_return_t kret;
396	upl_t upl;
397	int upl_flags;
398
399	if (ISSET(bp->nb_flags, NB_PAGELIST))
400		return (0);
401
402	upl_flags = UPL_PRECIOUS;
403	if (!ISSET(bp->nb_flags, NB_READ)) {
404		/*
405		 * We're doing a "write", so we intend to modify
406		 * the pages we're gathering.
407		 */
408		upl_flags |= UPL_WILL_MODIFY;
409	}
410	kret = ubc_create_upl(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize,
411				&upl, NULL, upl_flags);
412	if (kret == KERN_INVALID_ARGUMENT) {
413		/* vm object probably doesn't exist any more */
414		bp->nb_pagelist = NULL;
415		return (EINVAL);
416	}
417	if (kret != KERN_SUCCESS) {
418		printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
419		bp->nb_pagelist = NULL;
420		return (EIO);
421	}
422
423	FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_np);
424
425	bp->nb_pagelist = upl;
426	SET(bp->nb_flags, NB_PAGELIST);
427	return (0);
428}
429
430/*
431 * update buffer's valid/dirty info from UBC
432 * (must NOT be called with nfs_buf_mutex held)
433 */
434void
435nfs_buf_upl_check(struct nfsbuf *bp)
436{
437	upl_page_info_t *pl;
438	off_t filesize, fileoffset;
439	int i, npages;
440
441	if (!ISSET(bp->nb_flags, NB_PAGELIST))
442		return;
443
444	npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
445	filesize = ubc_getsize(NFSTOV(bp->nb_np));
446	fileoffset = NBOFF(bp);
447	if (fileoffset < filesize)
448		SET(bp->nb_flags, NB_CACHE);
449	else
450		CLR(bp->nb_flags, NB_CACHE);
451
452	pl = ubc_upl_pageinfo(bp->nb_pagelist);
453	bp->nb_valid = bp->nb_dirty = 0;
454
455	for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
456		/* anything beyond the end of the file is not valid or dirty */
457		if (fileoffset >= filesize)
458			break;
459		if (!upl_valid_page(pl, i)) {
460			CLR(bp->nb_flags, NB_CACHE);
461			continue;
462		}
463		NBPGVALID_SET(bp,i);
464		if (upl_dirty_page(pl, i))
465			NBPGDIRTY_SET(bp, i);
466	}
467	fileoffset = NBOFF(bp);
468	if (ISSET(bp->nb_flags, NB_CACHE)) {
469		bp->nb_validoff = 0;
470		bp->nb_validend = bp->nb_bufsize;
471		if (fileoffset + bp->nb_validend > filesize)
472			bp->nb_validend = filesize - fileoffset;
473	} else {
474		bp->nb_validoff = bp->nb_validend = -1;
475	}
476	FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
477	FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
478}
479
480/*
481 * make sure that a buffer is mapped
482 * (must NOT be called with nfs_buf_mutex held)
483 */
484int
485nfs_buf_map(struct nfsbuf *bp)
486{
487	kern_return_t kret;
488
489	if (bp->nb_data)
490		return (0);
491	if (!ISSET(bp->nb_flags, NB_PAGELIST))
492		return (EINVAL);
493
494	kret = ubc_upl_map(bp->nb_pagelist, (vm_offset_t *)&(bp->nb_data));
495	if (kret != KERN_SUCCESS)
496		panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
497	if (bp->nb_data == 0)
498		panic("ubc_upl_map mapped 0");
499	FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
500	return (0);
501}
502
503/*
504 * normalize an nfsbuf's valid range
505 *
506 * the read/write code guarantees that we'll always have a valid
507 * region that is an integral number of pages.  If either end
508 * of the valid range isn't page-aligned, it gets corrected
509 * here as we extend the valid range through all of the
510 * contiguous valid pages.
511 */
512void
513nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp)
514{
515	int pg, npg;
516	/* pull validoff back to start of contiguous valid page range */
517	pg = bp->nb_validoff/PAGE_SIZE;
518	while (pg >= 0 && NBPGVALID(bp,pg))
519		pg--;
520	bp->nb_validoff = (pg+1) * PAGE_SIZE;
521	/* push validend forward to end of contiguous valid page range */
522	npg = bp->nb_bufsize/PAGE_SIZE;
523	pg = bp->nb_validend/PAGE_SIZE;
524	while (pg < npg && NBPGVALID(bp,pg))
525		pg++;
526	bp->nb_validend = pg * PAGE_SIZE;
527	/* clip to EOF */
528	if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size)
529		bp->nb_validend = np->n_size % bp->nb_bufsize;
530}
531
532/*
533 * process some entries on the delayed write queue
534 * (must be called with nfs_buf_mutex held)
535 */
536void
537nfs_buf_delwri_service(void)
538{
539	struct nfsbuf *bp;
540	nfsnode_t np;
541	int error, i = 0;
542
543	while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
544		np = bp->nb_np;
545		nfs_buf_remfree(bp);
546		nfs_buf_refget(bp);
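		/* keep retrying the acquire: nfs_buf_acquire() returns EAGAIN each time it had to wait for the buffer */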
547		while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN);
548		nfs_buf_refrele(bp);
549		if (error)
550			break;
551		if (!bp->nb_np) {
552			/* buffer is no longer valid */
553			nfs_buf_drop(bp);
554			continue;
555		}
556		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
557			nfs_buf_check_write_verifier(np, bp);
558		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
559			/* put buffer at end of delwri list */
560			TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
561			nfsbufdelwricnt++;
562			nfs_buf_drop(bp);
563			lck_mtx_unlock(nfs_buf_mutex);
564			nfs_flushcommits(np, 1);
565		} else {
566			SET(bp->nb_flags, NB_ASYNC);
567			lck_mtx_unlock(nfs_buf_mutex);
568			nfs_buf_write(bp);
569		}
570		i++;
571		lck_mtx_lock(nfs_buf_mutex);
572	}
573}
574
575/*
576 * thread to service the delayed write queue when asked
577 */
578void
579nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr)
580{
581	struct timespec ts = { 30, 0 };
582	int error = 0;
583
584	lck_mtx_lock(nfs_buf_mutex);
585	while (!error) {
586		nfs_buf_delwri_service();
587		error = msleep(&nfsbufdelwrithd, nfs_buf_mutex, 0, "nfsbufdelwri", &ts);
588	}
589	nfsbufdelwrithd = NULL;
590	lck_mtx_unlock(nfs_buf_mutex);
591	thread_terminate(nfsbufdelwrithd);
592}
593
594/*
595 * try to push out some delayed/uncommitted writes
596 * ("locked" indicates whether nfs_buf_mutex is already held)
597 */
598void
599nfs_buf_delwri_push(int locked)
600{
601	if (TAILQ_EMPTY(&nfsbufdelwri))
602		return;
603	if (!locked)
604		lck_mtx_lock(nfs_buf_mutex);
605	/* wake up the delayed write service thread */
606	if (nfsbufdelwrithd)
607		wakeup(&nfsbufdelwrithd);
608	else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS)
609		thread_deallocate(nfsbufdelwrithd);
610	/* otherwise, try to do some of the work ourselves */
611	if (!nfsbufdelwrithd)
612		nfs_buf_delwri_service();
613	if (!locked)
614		lck_mtx_unlock(nfs_buf_mutex);
615}
616
617/*
618 * Get an nfs buffer.
619 *
620 * Returns errno on error, 0 otherwise.
621 * The buffer, if any, is returned in *bpp.
622 *
623 * If NBLK_ONLYVALID is set, only return buffer if found in cache.
624 * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
625 *
626 * Check for existence of buffer in cache.
627 * Or attempt to reuse a buffer from one of the free lists.
628 * Or allocate a new buffer if we haven't already hit max allocation.
629 * Or wait for a free buffer.
630 *
631 * If an available buffer is found, prepare it and return it.
632 *
633 * If the calling process is interrupted by a signal for
634 * an interruptible mount point, return EINTR.
635 */
636int
637nfs_buf_get(
638	nfsnode_t np,
639	daddr64_t blkno,
640	uint32_t size,
641	thread_t thd,
642	int flags,
643	struct nfsbuf **bpp)
644{
645	vnode_t vp = NFSTOV(np);
646	struct nfsmount *nmp = VTONMP(vp);
647	struct nfsbuf *bp;
648	uint32_t bufsize;
649	int slpflag = PCATCH;
650	int operation = (flags & NBLK_OPMASK);
651	int error = 0;
652	struct timespec ts;
653
654	FSDBG_TOP(541, np, blkno, size, flags);
655	*bpp = NULL;
656
657	bufsize = size;
658	if (bufsize > NFS_MAXBSIZE)
659		panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested");
660
661	if (!nmp) {
662		FSDBG_BOT(541, np, blkno, 0, ENXIO);
663		return (ENXIO);
664	}
665
666	if (!UBCINFOEXISTS(vp)) {
667		operation = NBLK_META;
668	} else if (bufsize < (uint32_t)nmp->nm_biosize) {
669		/* reg files should always have biosize blocks */
670		bufsize = nmp->nm_biosize;
671	}
672
673	/* if NBLK_WRITE, check for too many delayed/uncommitted writes */
674	if ((operation == NBLK_WRITE) && (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES)) {
675		FSDBG_TOP(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);
676
677		/* poke the delwri list */
678		nfs_buf_delwri_push(0);
679
680		/* sleep to let other threads run... */
681		tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
682		FSDBG_BOT(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);
683	}
684
685loop:
686	lck_mtx_lock(nfs_buf_mutex);
687
688	/* wait for any buffer invalidation/flushing to complete */
689	while (np->n_bflag & NBINVALINPROG) {
690		np->n_bflag |= NBINVALWANT;
691		ts.tv_sec = 2;
692		ts.tv_nsec = 0;
693		msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts);
694		if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
695			lck_mtx_unlock(nfs_buf_mutex);
696			FSDBG_BOT(541, np, blkno, 0, error);
697			return (error);
698		}
699		if (np->n_bflag & NBINVALINPROG)
700			slpflag = 0;
701	}
702
703	/* check for existence of nfsbuf in cache */
704	if ((bp = nfs_buf_incore(np, blkno))) {
705		/* if busy, set wanted and wait */
706		if (ISSET(bp->nb_lflags, NBL_BUSY)) {
707			if (flags & NBLK_NOWAIT) {
708				lck_mtx_unlock(nfs_buf_mutex);
709				FSDBG_BOT(541, np, blkno, bp, 0xbcbcbcbc);
710				return (0);
711			}
712			FSDBG_TOP(543, np, blkno, bp, bp->nb_flags);
713			SET(bp->nb_lflags, NBL_WANTED);
714
715			ts.tv_sec = 2;
716			ts.tv_nsec = 0;
717			msleep(bp, nfs_buf_mutex, slpflag|(PRIBIO+1)|PDROP,
718					"nfsbufget", (slpflag == PCATCH) ? NULL : &ts);
719			slpflag = 0;
720			FSDBG_BOT(543, np, blkno, bp, bp->nb_flags);
721			if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
722				FSDBG_BOT(541, np, blkno, 0, error);
723				return (error);
724			}
725			goto loop;
726		}
727		if (bp->nb_bufsize != bufsize)
728			panic("nfsbuf size mismatch");
729		SET(bp->nb_lflags, NBL_BUSY);
730		SET(bp->nb_flags, NB_CACHE);
731		nfs_buf_remfree(bp);
732		/* additional paranoia: */
733		if (ISSET(bp->nb_flags, NB_PAGELIST))
734			panic("pagelist buffer was not busy");
735		goto buffer_setup;
736	}
737
738	if (flags & NBLK_ONLYVALID) {
739		lck_mtx_unlock(nfs_buf_mutex);
740		FSDBG_BOT(541, np, blkno, 0, 0x0000cace);
741		return (0);
742	}
743
744	/*
745	 * where to get a free buffer:
746	 * - if meta and maxmeta reached, must reuse meta
747	 * - alloc new if we haven't reached min bufs
748	 * - if free lists are NOT empty
749	 *   - if free list is stale, use it
750	 *   - else if freemeta list is stale, use it
751	 *   - else if max bufs allocated, use least-time-to-stale
752	 * - alloc new if we haven't reached max allowed
753	 * - start clearing out delwri list and try again
754	 */
755
756	if ((operation == NBLK_META) && (nfsbufmetacnt >= nfsbufmetamax)) {
757		/* if we've hit max meta buffers, must reuse a meta buffer */
758		bp = TAILQ_FIRST(&nfsbuffreemeta);
759	} else if ((nfsbufcnt > nfsbufmin) &&
760	    (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) {
761		/* try to pull an nfsbuf off a free list */
762		struct nfsbuf *lrubp, *metabp;
763		struct timeval now;
764		microuptime(&now);
765
766		/* if the next LRU or META buffer is invalid or stale, use it */
767		lrubp = TAILQ_FIRST(&nfsbuffree);
768		if (lrubp && (!NBUFSTAMPVALID(lrubp) ||
769		    ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec)))
770			bp = lrubp;
771		metabp = TAILQ_FIRST(&nfsbuffreemeta);
772		if (!bp && metabp && (!NBUFSTAMPVALID(metabp) ||
773		    ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec)))
774			bp = metabp;
775
776		if (!bp && (nfsbufcnt >= nfsbufmax)) {
777			/* we've already allocated all bufs, so */
778			/* choose the buffer that'll go stale first */
779			if (!metabp)
780				bp = lrubp;
781			else if (!lrubp)
782				bp = metabp;
783			else {
784				int32_t lru_stale_time, meta_stale_time;
785				lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE;
786				meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE;
787				if (lru_stale_time <= meta_stale_time)
788					bp = lrubp;
789				else
790					bp = metabp;
791			}
792		}
793	}
794
795	if (bp) {
796		/* we have a buffer to reuse */
797		FSDBG(544, np, blkno, bp, bp->nb_flags);
798		nfs_buf_remfree(bp);
799		if (ISSET(bp->nb_flags, NB_DELWRI))
800			panic("nfs_buf_get: delwri");
801		SET(bp->nb_lflags, NBL_BUSY);
802		/* disassociate buffer from previous nfsnode */
803		if (bp->nb_np) {
804			if (bp->nb_vnbufs.le_next != NFSNOLIST) {
805				LIST_REMOVE(bp, nb_vnbufs);
806				bp->nb_vnbufs.le_next = NFSNOLIST;
807			}
808			bp->nb_np = NULL;
809		}
810		LIST_REMOVE(bp, nb_hash);
811		/* nuke any creds we're holding */
812		if (IS_VALID_CRED(bp->nb_rcred))
813			kauth_cred_unref(&bp->nb_rcred);
814		if (IS_VALID_CRED(bp->nb_wcred))
815			kauth_cred_unref(&bp->nb_wcred);
816		/* if buf will no longer be NB_META, dump old buffer */
817		if (operation == NBLK_META) {
818			if (!ISSET(bp->nb_flags, NB_META))
819				nfsbufmetacnt++;
820		} else if (ISSET(bp->nb_flags, NB_META)) {
821			if (bp->nb_data) {
822				kfree(bp->nb_data, bp->nb_bufsize);
823				bp->nb_data = NULL;
824			}
825			nfsbufmetacnt--;
826		}
827		/* re-init buf fields */
828		bp->nb_error = 0;
829		bp->nb_validoff = bp->nb_validend = -1;
830		bp->nb_dirtyoff = bp->nb_dirtyend = 0;
831		bp->nb_valid = 0;
832		bp->nb_dirty = 0;
833		bp->nb_verf = 0;
834	} else {
835		/* no buffer to reuse */
836		if ((nfsbufcnt < nfsbufmax) &&
837		    ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) {
838			/* just alloc a new one */
839			MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
840			if (!bp) {
841				lck_mtx_unlock(nfs_buf_mutex);
842				FSDBG_BOT(541, np, blkno, 0, error);
843				return (ENOMEM);
844			}
845			nfsbufcnt++;
846
847			/*
848			 * If any excess bufs, make sure the timer
849			 * is running to free them up later.
850			 */
851			if (nfsbufcnt > nfsbufmin && !nfs_buf_timer_on) {
852				nfs_buf_timer_on = 1;
853				nfs_interval_timer_start(nfs_buf_timer_call,
854					NFSBUF_FREE_PERIOD * 1000);
855			}
856
857			if (operation == NBLK_META)
858				nfsbufmetacnt++;
859			NFSBUFCNTCHK();
860			/* init nfsbuf */
861			bzero(bp, sizeof(*bp));
862			bp->nb_free.tqe_next = NFSNOLIST;
863			bp->nb_validoff = bp->nb_validend = -1;
864			FSDBG(545, np, blkno, bp, 0);
865		} else {
866			/* too many bufs... wait for buffers to free up */
867			FSDBG_TOP(546, np, blkno, nfsbufcnt, nfsbufmax);
868
869			/* poke the delwri list */
870			nfs_buf_delwri_push(1);
871
872			nfsneedbuffer = 1;
873			msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH|PDROP, "nfsbufget", NULL);
874			FSDBG_BOT(546, np, blkno, nfsbufcnt, nfsbufmax);
875			if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
876				FSDBG_BOT(541, np, blkno, 0, error);
877				return (error);
878			}
879			goto loop;
880		}
881	}
882
883	/* set up nfsbuf */
884	SET(bp->nb_lflags, NBL_BUSY);
885	bp->nb_flags = 0;
886	bp->nb_lblkno = blkno;
887	/* insert buf in hash */
888	LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash);
889	/* associate buffer with new nfsnode */
890	bp->nb_np = np;
891	LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
892
893buffer_setup:
894
895	/* unlock hash */
896	lck_mtx_unlock(nfs_buf_mutex);
897
898	switch (operation) {
899	case NBLK_META:
900		SET(bp->nb_flags, NB_META);
901		if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
902			kfree(bp->nb_data, bp->nb_bufsize);
903			bp->nb_data = NULL;
904			bp->nb_validoff = bp->nb_validend = -1;
905			bp->nb_dirtyoff = bp->nb_dirtyend = 0;
906			bp->nb_valid = 0;
907			bp->nb_dirty = 0;
908			CLR(bp->nb_flags, NB_CACHE);
909		}
910		if (!bp->nb_data)
911			bp->nb_data = kalloc(bufsize);
912		if (!bp->nb_data) {
913			/* Ack! couldn't allocate the data buffer! */
914			/* clean up buffer and return error */
915			lck_mtx_lock(nfs_buf_mutex);
916			LIST_REMOVE(bp, nb_vnbufs);
917			bp->nb_vnbufs.le_next = NFSNOLIST;
918			bp->nb_np = NULL;
919			/* invalidate usage timestamp to allow immediate freeing */
920			NBUFSTAMPINVALIDATE(bp);
921			if (bp->nb_free.tqe_next != NFSNOLIST)
922				panic("nfsbuf on freelist");
923			TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
924			nfsbuffreecnt++;
925			lck_mtx_unlock(nfs_buf_mutex);
926			FSDBG_BOT(541, np, blkno, 0xb00, ENOMEM);
927			return (ENOMEM);
928		}
929		bp->nb_bufsize = bufsize;
930		break;
931
932	case NBLK_READ:
933	case NBLK_WRITE:
934		/*
935		 * Set or clear NB_READ now to let the UPL subsystem know
936		 * if we intend to modify the pages or not.
937		 */
938		if (operation == NBLK_READ) {
939			SET(bp->nb_flags, NB_READ);
940		} else {
941			CLR(bp->nb_flags, NB_READ);
942		}
943		if (bufsize < PAGE_SIZE)
944			bufsize = PAGE_SIZE;
945		bp->nb_bufsize = bufsize;
946		bp->nb_validoff = bp->nb_validend = -1;
947
948		if (UBCINFOEXISTS(vp)) {
949			/* set up upl */
950			if (nfs_buf_upl_setup(bp)) {
951				/* unable to create upl */
952				/* vm object must no longer exist */
953				/* clean up buffer and return error */
954				lck_mtx_lock(nfs_buf_mutex);
955				LIST_REMOVE(bp, nb_vnbufs);
956				bp->nb_vnbufs.le_next = NFSNOLIST;
957				bp->nb_np = NULL;
958				/* invalidate usage timestamp to allow immediate freeing */
959				NBUFSTAMPINVALIDATE(bp);
960				if (bp->nb_free.tqe_next != NFSNOLIST)
961					panic("nfsbuf on freelist");
962				TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
963				nfsbuffreecnt++;
964				lck_mtx_unlock(nfs_buf_mutex);
965				FSDBG_BOT(541, np, blkno, 0x2bc, EIO);
966				return (EIO);
967			}
968			nfs_buf_upl_check(bp);
969		}
970		break;
971
972	default:
973		panic("nfs_buf_get: %d unknown operation", operation);
974	}
975
976	*bpp = bp;
977
978	FSDBG_BOT(541, np, blkno, bp, bp->nb_flags);
979
980	return (0);
981}
982
983void
984nfs_buf_release(struct nfsbuf *bp, int freeup)
985{
986	nfsnode_t np = bp->nb_np;
987	vnode_t vp;
988	struct timeval now;
989	int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite;
990
991	FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
992	FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
993	FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);
994
995	vp = np ? NFSTOV(np) : NULL;
996	if (vp && UBCINFOEXISTS(vp) && bp->nb_bufsize) {
997		int upl_flags, rv;
998		upl_t upl;
999		uint32_t i;
1000
1001		if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
1002			rv = nfs_buf_upl_setup(bp);
1003			if (rv)
1004				printf("nfs_buf_release: upl create failed %d\n", rv);
1005			else
1006				nfs_buf_upl_check(bp);
1007		}
1008		upl = bp->nb_pagelist;
1009		if (!upl)
1010			goto pagelist_cleanup_done;
1011		if (bp->nb_data) {
1012			if (ubc_upl_unmap(upl) != KERN_SUCCESS)
1013				panic("ubc_upl_unmap failed");
1014			bp->nb_data = NULL;
1015		}
1016		/*
1017		 * Abort the pages on error, or if this is an invalid or
1018		 * non-needcommit nocache buffer AND no pages are dirty.
1019		 */
1020		if (ISSET(bp->nb_flags, NB_ERROR) || (!bp->nb_dirty && (ISSET(bp->nb_flags, NB_INVAL) ||
1021		    (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))))) {
1022			if (ISSET(bp->nb_flags, (NB_READ | NB_INVAL | NB_NOCACHE)))
1023				upl_flags = UPL_ABORT_DUMP_PAGES;
1024			else
1025				upl_flags = 0;
1026			ubc_upl_abort(upl, upl_flags);
1027			goto pagelist_cleanup_done;
1028		}
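		/* otherwise, commit or abort each page individually based on its valid/dirty state */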
1029		for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) {
1030			if (!NBPGVALID(bp,i))
1031				ubc_upl_abort_range(upl,
1032					i*PAGE_SIZE, PAGE_SIZE,
1033					UPL_ABORT_DUMP_PAGES |
1034					UPL_ABORT_FREE_ON_EMPTY);
1035			else {
1036				if (NBPGDIRTY(bp,i))
1037					upl_flags = UPL_COMMIT_SET_DIRTY;
1038				else
1039					upl_flags = UPL_COMMIT_CLEAR_DIRTY;
1040
1041				if (!ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))
1042					upl_flags |= UPL_COMMIT_CLEAR_PRECIOUS;
1043
1044				ubc_upl_commit_range(upl,
1045					i*PAGE_SIZE, PAGE_SIZE,
1046					upl_flags |
1047					UPL_COMMIT_INACTIVATE |
1048					UPL_COMMIT_FREE_ON_EMPTY);
1049			}
1050		}
1051pagelist_cleanup_done:
1052		/* invalidate any pages past EOF */
1053		if (NBOFF(bp) + bp->nb_bufsize > (off_t)(np->n_size)) {
1054			off_t start, end;
1055			start = trunc_page_64(np->n_size) + PAGE_SIZE_64;
1056			end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize);
1057			if (start < NBOFF(bp))
1058				start = NBOFF(bp);
1059			if (end > start) {
1060				if ((rv = ubc_msync(vp, start, end, NULL, UBC_INVALIDATE)))
1061					printf("nfs_buf_release(): ubc_msync failed!, error %d\n", rv);
1062			}
1063		}
1064		CLR(bp->nb_flags, NB_PAGELIST);
1065		bp->nb_pagelist = NULL;
1066	}
1067
1068	lck_mtx_lock(nfs_buf_mutex);
1069
1070	wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0;
1071
1072	/* Wake up any processes waiting for any buffer to become free. */
1073	if (nfsneedbuffer) {
1074		nfsneedbuffer = 0;
1075		wakeup_needbuffer = 1;
1076	}
1077	/* Wake up any processes waiting for _this_ buffer to become free. */
1078	if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1079		CLR(bp->nb_lflags, NBL_WANTED);
1080		wakeup_buffer = 1;
1081	}
1082
1083	/* If it's non-needcommit nocache, or an error, mark it invalid. */
1084	if (ISSET(bp->nb_flags, NB_ERROR) ||
1085	    (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI))))
1086		SET(bp->nb_flags, NB_INVAL);
1087
1088	if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
1089		/* If it's invalid or empty, dissociate it from its nfsnode */
1090		if (bp->nb_vnbufs.le_next != NFSNOLIST) {
1091			LIST_REMOVE(bp, nb_vnbufs);
1092			bp->nb_vnbufs.le_next = NFSNOLIST;
1093		}
1094		bp->nb_np = NULL;
1095		/* if this was a delayed write, wakeup anyone */
1096		/* waiting for delayed writes to complete */
1097		if (ISSET(bp->nb_flags, NB_DELWRI)) {
1098			CLR(bp->nb_flags, NB_DELWRI);
1099			nfs_nbdwrite--;
1100			NFSBUFCNTCHK();
1101			wakeup_nbdwrite = 1;
1102		}
1103		/* invalidate usage timestamp to allow immediate freeing */
1104		NBUFSTAMPINVALIDATE(bp);
1105		/* put buffer at head of free list */
1106		if (bp->nb_free.tqe_next != NFSNOLIST)
1107			panic("nfsbuf on freelist");
1108		SET(bp->nb_flags, NB_INVAL);
1109		if (ISSET(bp->nb_flags, NB_META)) {
1110			TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free);
1111			nfsbuffreemetacnt++;
1112		} else {
1113			TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
1114			nfsbuffreecnt++;
1115		}
1116	} else if (ISSET(bp->nb_flags, NB_DELWRI)) {
1117		/* put buffer at end of delwri list */
1118		if (bp->nb_free.tqe_next != NFSNOLIST)
1119			panic("nfsbuf on freelist");
1120		TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
1121		nfsbufdelwricnt++;
1122		freeup = 0;
1123	} else {
1124		/* update usage timestamp */
1125		microuptime(&now);
1126		bp->nb_timestamp = now.tv_sec;
1127		/* put buffer at end of free list */
1128		if (bp->nb_free.tqe_next != NFSNOLIST)
1129			panic("nfsbuf on freelist");
1130		if (ISSET(bp->nb_flags, NB_META)) {
1131			TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free);
1132			nfsbuffreemetacnt++;
1133		} else {
1134			TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
1135			nfsbuffreecnt++;
1136		}
1137	}
1138
1139	NFSBUFCNTCHK();
1140
1141	/* Unlock the buffer. */
1142	CLR(bp->nb_flags, (NB_ASYNC | NB_STABLE));
1143	CLR(bp->nb_lflags, NBL_BUSY);
1144
1145	FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
1146
1147	lck_mtx_unlock(nfs_buf_mutex);
1148
1149	if (wakeup_needbuffer)
1150		wakeup(&nfsneedbuffer);
1151	if (wakeup_buffer)
1152		wakeup(bp);
1153	if (wakeup_nbdwrite)
1154		wakeup(&nfs_nbdwrite);
1155	if (freeup)
1156		NFS_BUF_FREEUP();
1157}
1158
1159/*
1160 * Wait for operations on the buffer to complete.
1161 * When they do, extract and return the I/O's error value.
1162 */
1163int
1164nfs_buf_iowait(struct nfsbuf *bp)
1165{
1166	FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1167
1168	lck_mtx_lock(nfs_buf_mutex);
1169
1170	while (!ISSET(bp->nb_flags, NB_DONE))
1171		msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL);
1172
1173	lck_mtx_unlock(nfs_buf_mutex);
1174
1175	FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1176
1177	/* check for interruption of I/O, then errors. */
1178	if (ISSET(bp->nb_flags, NB_EINTR)) {
1179		CLR(bp->nb_flags, NB_EINTR);
1180		return (EINTR);
1181	} else if (ISSET(bp->nb_flags, NB_ERROR))
1182		return (bp->nb_error ? bp->nb_error : EIO);
1183	return (0);
1184}
1185
1186/*
1187 * Mark I/O complete on a buffer.
1188 */
1189void
1190nfs_buf_iodone(struct nfsbuf *bp)
1191{
1192
1193	FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1194
1195	if (ISSET(bp->nb_flags, NB_DONE))
1196		panic("nfs_buf_iodone already");
1197
1198	if (!ISSET(bp->nb_flags, NB_READ)) {
1199		CLR(bp->nb_flags, NB_WRITEINPROG);
1200		/*
1201		 * vnode_writedone() takes care of waking up
1202		 * any throttled write operations
1203		 */
1204		vnode_writedone(NFSTOV(bp->nb_np));
1205		nfs_node_lock_force(bp->nb_np);
1206		bp->nb_np->n_numoutput--;
1207		nfs_node_unlock(bp->nb_np);
1208	}
1209	if (ISSET(bp->nb_flags, NB_ASYNC)) {	/* if async, release it */
1210		SET(bp->nb_flags, NB_DONE);		/* note that it's done */
1211		nfs_buf_release(bp, 1);
1212	} else {		                        /* or just wakeup the buffer */
1213	        lck_mtx_lock(nfs_buf_mutex);
1214		SET(bp->nb_flags, NB_DONE);		/* note that it's done */
1215		CLR(bp->nb_lflags, NBL_WANTED);
1216	        lck_mtx_unlock(nfs_buf_mutex);
1217		wakeup(bp);
1218	}
1219
1220	FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1221}
1222
1223void
1224nfs_buf_write_delayed(struct nfsbuf *bp)
1225{
1226	nfsnode_t np = bp->nb_np;
1227
1228	FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
1229	FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);
1230
1231	/*
1232	 * If the block hasn't been seen before:
1233	 *	(1) Mark it as having been seen,
1234	 *	(2) Make sure it's on its node's correct block list,
1235	 */
1236	if (!ISSET(bp->nb_flags, NB_DELWRI)) {
1237		SET(bp->nb_flags, NB_DELWRI);
1238		/* move to dirty list */
1239		lck_mtx_lock(nfs_buf_mutex);
1240		nfs_nbdwrite++;
1241		NFSBUFCNTCHK();
1242		if (bp->nb_vnbufs.le_next != NFSNOLIST)
1243			LIST_REMOVE(bp, nb_vnbufs);
1244		LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
1245		lck_mtx_unlock(nfs_buf_mutex);
1246	}
1247
1248	/*
1249	 * If the vnode has "too many" write operations in progress,
1250	 * wait for them to finish the IO
1251	 */
1252	vnode_waitforwrites(NFSTOV(np), VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed");
1253
1254	/* the file is in a modified state, so make sure the flag's set */
1255	nfs_node_lock_force(np);
1256	np->n_flag |= NMODIFIED;
1257	nfs_node_unlock(np);
1258
1259	/*
1260	 * If we have too many delayed write buffers,
1261	 * just fall back to doing the async write.
1262	 */
1263	if (nfs_nbdwrite < 0)
1264		panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
1265	if (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES) {
1266		/* issue async write */
1267		SET(bp->nb_flags, NB_ASYNC);
1268		nfs_buf_write(bp);
1269		FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1270		return;
1271	}
1272
1273	/* Otherwise, the "write" is done, so mark and release the buffer. */
1274	SET(bp->nb_flags, NB_DONE);
1275	nfs_buf_release(bp, 1);
1276	FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
1277	return;
1278}
1279
1280/*
1281 * Check that a "needcommit" buffer can still be committed.
1282 * If the write verifier has changed, we need to clear
1283 * the needcommit flag.
1284 */
1285void
1286nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp)
1287{
1288	struct nfsmount *nmp;
1289
1290	if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT))
1291		return;
1292
1293	nmp = NFSTONMP(np);
1294	if (!nmp)
1295		return;
1296	if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf))
1297		return;
1298
1299	/* write verifier changed, clear commit/wverf flags */
1300	CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_STALEWVERF));
1301	bp->nb_verf = 0;
1302	nfs_node_lock_force(np);
1303	np->n_needcommitcnt--;
1304	CHECK_NEEDCOMMITCNT(np);
1305	nfs_node_unlock(np);
1306}
1307
1308/*
1309 * add a reference to a buffer so it doesn't disappear while being used
1310 * (must be called with nfs_buf_mutex held)
1311 */
1312void
1313nfs_buf_refget(struct nfsbuf *bp)
1314{
1315	bp->nb_refs++;
1316}
1317/*
1318 * release a reference on a buffer
1319 * (must be called with nfs_buf_mutex held)
1320 */
1321void
1322nfs_buf_refrele(struct nfsbuf *bp)
1323{
1324	bp->nb_refs--;
1325}
1326
1327/*
1328 * mark a particular buffer as BUSY
1329 * (must be called with nfs_buf_mutex held)
1330 */
1331errno_t
1332nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
1333{
1334	errno_t error;
1335	struct timespec ts;
1336
1337	if (ISSET(bp->nb_lflags, NBL_BUSY)) {
1338		/*
1339		 * since the lck_mtx_lock may block, the buffer
1340		 * may become BUSY, so we need to recheck for
1341		 * a NOWAIT request
1342		 */
1343	        if (flags & NBAC_NOWAIT)
1344			return (EBUSY);
1345	        SET(bp->nb_lflags, NBL_WANTED);
1346
1347		ts.tv_sec = (slptimeo/100);
1348		/* the hz value is 100, so each tick of slptimeo is 10ms */
1349		ts.tv_nsec = (slptimeo % 100) * 10  * NSEC_PER_USEC * 1000;
1350
1351		error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1),
1352			"nfs_buf_acquire", &ts);
1353		if (error)
1354			return (error);
1355		return (EAGAIN);
1356	}
1357	if (flags & NBAC_REMOVE)
1358	        nfs_buf_remfree(bp);
1359	SET(bp->nb_lflags, NBL_BUSY);
1360
1361	return (0);
1362}
1363
1364/*
1365 * simply drop the BUSY status of a buffer
1366 * (must be called with nfs_buf_mutex held)
1367 */
1368void
1369nfs_buf_drop(struct nfsbuf *bp)
1370{
1371	int need_wakeup = 0;
1372
1373	if (!ISSET(bp->nb_lflags, NBL_BUSY))
1374		panic("nfs_buf_drop: buffer not busy!");
1375	if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1376	        /* delay the actual wakeup until after we clear NBL_BUSY */
1377		need_wakeup = 1;
1378	}
1379	/* Unlock the buffer. */
1380	CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED));
1381
1382	if (need_wakeup)
1383	        wakeup(bp);
1384}
1385
1386/*
1387 * prepare for iterating over an nfsnode's buffer list
1388 * this lock protects the queue manipulation
1389 * (must be called with nfs_buf_mutex held)
1390 */
1391int
1392nfs_buf_iterprepare(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
1393{
1394	struct nfsbuflists *listheadp;
1395
1396	if (flags & NBI_DIRTY)
1397		listheadp = &np->n_dirtyblkhd;
1398	else
1399		listheadp = &np->n_cleanblkhd;
1400
1401	if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) {
1402	        LIST_INIT(iterheadp);
1403		return(EWOULDBLOCK);
1404	}
1405
1406	while (np->n_bufiterflags & NBI_ITER) {
1407	        np->n_bufiterflags |= NBI_ITERWANT;
1408		msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL);
1409	}
1410	if (LIST_EMPTY(listheadp)) {
1411	        LIST_INIT(iterheadp);
1412		return(EINVAL);
1413	}
1414	np->n_bufiterflags |= NBI_ITER;
1415
1416	iterheadp->lh_first = listheadp->lh_first;
1417	listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first;
1418	LIST_INIT(listheadp);
1419
1420	return(0);
1421}
1422
1423/*
1424 * clean up after iterating over an nfsnode's buffer list
1425 * this lock protects the queue manipulation
1426 * (must be called with nfs_buf_mutex held)
1427 */
1428void
1429nfs_buf_itercomplete(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
1430{
1431	struct nfsbuflists * listheadp;
1432	struct nfsbuf *bp;
1433
1434	if (flags & NBI_DIRTY)
1435		listheadp = &np->n_dirtyblkhd;
1436	else
1437		listheadp = &np->n_cleanblkhd;
1438
1439	while (!LIST_EMPTY(iterheadp)) {
1440		bp = LIST_FIRST(iterheadp);
1441		LIST_REMOVE(bp, nb_vnbufs);
1442		LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs);
1443	}
1444
1445	np->n_bufiterflags &= ~NBI_ITER;
1446	if (np->n_bufiterflags & NBI_ITERWANT) {
1447		np->n_bufiterflags &= ~NBI_ITERWANT;
1448		wakeup(&np->n_bufiterflags);
1449	}
1450}
1451
1452
1453/*
1454 * Read an NFS buffer for a file.
1455 */
1456int
1457nfs_buf_read(struct nfsbuf *bp)
1458{
1459	int error = 0;
1460	nfsnode_t np;
1461	thread_t thd;
1462	kauth_cred_t cred;
1463
1464	np = bp->nb_np;
1465	cred = bp->nb_rcred;
1466	if (IS_VALID_CRED(cred))
1467		kauth_cred_ref(cred);
1468	thd = ISSET(bp->nb_flags, NB_ASYNC) ? NULL : current_thread();
1469
1470	/* sanity checks */
1471	if (!ISSET(bp->nb_flags, NB_READ))
1472		panic("nfs_buf_read: !NB_READ");
1473	if (ISSET(bp->nb_flags, NB_DONE))
1474		CLR(bp->nb_flags, NB_DONE);
1475
1476	NFS_BUF_MAP(bp);
1477
1478	OSAddAtomic64(1, &nfsstats.read_bios);
1479
1480	error = nfs_buf_read_rpc(bp, thd, cred);
1481	/*
1482	 * For async I/O, the callbacks will finish up the
1483	 * read.  Otherwise, the read has already been finished.
1484	 */
1485
1486	if (IS_VALID_CRED(cred))
1487		kauth_cred_unref(&cred);
1488	return (error);
1489}
1490
1491/*
1492 * finish the reading of a buffer
1493 */
1494void
1495nfs_buf_read_finish(struct nfsbuf *bp)
1496{
1497	nfsnode_t np = bp->nb_np;
1498	struct nfsmount *nmp;
1499
1500	if (!ISSET(bp->nb_flags, NB_ERROR)) {
1501		/* update valid range */
1502		bp->nb_validoff = 0;
1503		bp->nb_validend = bp->nb_endio;
1504		if (bp->nb_endio < (int)bp->nb_bufsize) {
1505			/*
1506			 * The read may be short because we have unflushed writes
1507			 * that are extending the file size and the reads hit the
1508			 * (old) EOF on the server.  So, just make sure nb_validend
1509			 * correctly tracks EOF.
1510			 * Note that the missing data should have already been zeroed
1511			 * in nfs_buf_read_rpc_finish().
1512			 */
1513			off_t boff = NBOFF(bp);
1514			if ((off_t)np->n_size >= (boff + bp->nb_bufsize))
1515				bp->nb_validend = bp->nb_bufsize;
1516			else if ((off_t)np->n_size >= boff)
1517				bp->nb_validend = np->n_size - boff;
1518			else
1519				bp->nb_validend = 0;
1520		}
1521		if ((nmp = NFSTONMP(np)) && (nmp->nm_vers == NFS_VER2) &&
1522		    ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL))
1523			bp->nb_validend = 0x100000000LL - NBOFF(bp);
1524		bp->nb_valid = (1 << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1;
1525		if (bp->nb_validend & PAGE_MASK) {
1526			/* zero-fill remainder of last page */
1527			bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK));
1528		}
1529	}
1530	nfs_buf_iodone(bp);
1531}
1532
1533/*
1534 * initiate the NFS READ RPC(s) for a buffer
1535 */
1536int
1537nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
1538{
1539	struct nfsmount *nmp;
1540	nfsnode_t np = bp->nb_np;
1541	int error = 0, nfsvers, async;
1542	int offset, nrpcs;
1543	uint32_t nmrsize, length, len;
1544	off_t boff;
1545	struct nfsreq *req;
1546	struct nfsreq_cbinfo cb;
1547
1548	nmp = NFSTONMP(np);
1549	if (!nmp) {
1550		bp->nb_error = error = ENXIO;
1551		SET(bp->nb_flags, NB_ERROR);
1552		nfs_buf_iodone(bp);
1553		return (error);
1554	}
1555	nfsvers = nmp->nm_vers;
1556	nmrsize = nmp->nm_rsize;
1557
1558	boff = NBOFF(bp);
1559	offset = 0;
1560	length = bp->nb_bufsize;
1561
1562	if (nfsvers == NFS_VER2) {
1563		if (boff > 0xffffffffLL) {
1564			bp->nb_error = error = EFBIG;
1565			SET(bp->nb_flags, NB_ERROR);
1566			nfs_buf_iodone(bp);
1567			return (error);
1568		}
1569		if ((boff + length - 1) > 0xffffffffLL)
1570			length = 0x100000000LL - boff;
1571	}
1572
1573	/* Note: Can only do async I/O if nfsiods are configured. */
1574	async = (bp->nb_flags & NB_ASYNC);
1575	cb.rcb_func = async ? nfs_buf_read_rpc_finish : NULL;
1576	cb.rcb_bp = bp;
1577
1578	bp->nb_offio = bp->nb_endio = 0;
1579	bp->nb_rpcs = nrpcs = (length + nmrsize - 1) / nmrsize;
1580	if (async && (nrpcs > 1)) {
1581		SET(bp->nb_flags, NB_MULTASYNCRPC);
1582	} else {
1583		CLR(bp->nb_flags, NB_MULTASYNCRPC);
1584	}
1585
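	/* issue the READ RPC(s): the buffer is filled by one or more requests of at most nmrsize bytes each */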
1586	while (length > 0) {
1587		if (ISSET(bp->nb_flags, NB_ERROR)) {
1588			error = bp->nb_error;
1589			break;
1590		}
1591		len = (length > nmrsize) ? nmrsize : length;
1592		cb.rcb_args[0] = offset;
1593		cb.rcb_args[1] = len;
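		/* for NFSv4, record the state generation so the completion code can detect intervening server recovery */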
1594		if (nmp->nm_vers >= NFS_VER4)
1595			cb.rcb_args[2] = nmp->nm_stategenid;
1596		req = NULL;
1597		error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req);
1598		if (error)
1599			break;
1600		offset += len;
1601		length -= len;
1602		if (async)
1603			continue;
1604		nfs_buf_read_rpc_finish(req);
1605		if (ISSET(bp->nb_flags, NB_ERROR)) {
1606			error = bp->nb_error;
1607			break;
1608		}
1609	}
1610
1611	if (length > 0) {
1612		/*
1613		 * Something bad happened while trying to send the RPC(s).
1614		 * Wait for any outstanding requests to complete.
1615		 */
1616		bp->nb_error = error;
1617		SET(bp->nb_flags, NB_ERROR);
1618		if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
1619			nrpcs = (length + nmrsize - 1) / nmrsize;
1620			lck_mtx_lock(nfs_buf_mutex);
1621			bp->nb_rpcs -= nrpcs;
1622			if (bp->nb_rpcs == 0) {
1623				/* No RPCs left, so the buffer's done */
1624				lck_mtx_unlock(nfs_buf_mutex);
1625				nfs_buf_iodone(bp);
1626			} else {
1627				/* wait for the last RPC to mark it done */
1628				while (bp->nb_rpcs > 0)
1629					msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
1630						"nfs_buf_read_rpc_cancel", NULL);
1631				lck_mtx_unlock(nfs_buf_mutex);
1632			}
1633		} else {
1634			nfs_buf_iodone(bp);
1635		}
1636	}
1637
1638	return (error);
1639}
1640
1641/*
1642 * finish up an NFS READ RPC on a buffer
1643 */
1644void
1645nfs_buf_read_rpc_finish(struct nfsreq *req)
1646{
1647	struct nfsmount *nmp;
1648	size_t rlen;
1649	struct nfsreq_cbinfo cb;
1650	struct nfsbuf *bp;
1651	int error = 0, nfsvers, offset, length, eof = 0, multasyncrpc, finished;
1652	void *wakeme = NULL;
1653	struct nfsreq *rreq = NULL;
1654	nfsnode_t np;
1655	thread_t thd;
1656	kauth_cred_t cred;
1657	uio_t auio;
1658	char uio_buf [ UIO_SIZEOF(1) ];
1659
1660finish:
1661	np = req->r_np;
1662	thd = req->r_thread;
1663	cred = req->r_cred;
1664	if (IS_VALID_CRED(cred))
1665		kauth_cred_ref(cred);
1666	cb = req->r_callback;
1667	bp = cb.rcb_bp;
1668	if (cb.rcb_func) /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
1669		nfs_request_ref(req, 0);
1670
1671	nmp = NFSTONMP(np);
1672	if (!nmp) {
1673		SET(bp->nb_flags, NB_ERROR);
1674		bp->nb_error = error = ENXIO;
1675	}
1676	if (error || ISSET(bp->nb_flags, NB_ERROR)) {
1677		/* just drop it */
1678		nfs_request_async_cancel(req);
1679		goto out;
1680	}
1681
1682	nfsvers = nmp->nm_vers;
1683	offset = cb.rcb_args[0];
1684	rlen = length = cb.rcb_args[1];
1685
1686	auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
1687                                UIO_READ, &uio_buf, sizeof(uio_buf));
1688	uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
1689
1690	/* finish the RPC */
1691	error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, auio, &rlen, &eof);
1692	if ((error == EINPROGRESS) && cb.rcb_func) {
1693		/* async request restarted */
1694		if (cb.rcb_func)
1695			nfs_request_rele(req);
1696		if (IS_VALID_CRED(cred))
1697			kauth_cred_unref(&cred);
1698		return;
1699	}
1700	if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
1701		lck_mtx_lock(&nmp->nm_lock);
1702		if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) {
1703			NP(np, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
1704				error, NBOFF(bp)+offset, cb.rcb_args[2], nmp->nm_stategenid);
1705			nfs_need_recover(nmp, error);
1706		}
1707		lck_mtx_unlock(&nmp->nm_lock);
1708		if (np->n_flag & NREVOKE) {
1709			error = EIO;
1710		} else {
1711			if (error == NFSERR_GRACE) {
1712				if (cb.rcb_func) {
1713					/*
1714					 * For an async I/O request, handle a grace delay just like
1715					 * jukebox errors.  Set the resend time and queue it up.
1716					 */
1717					struct timeval now;
1718					if (req->r_nmrep.nmc_mhead) {
1719						mbuf_freem(req->r_nmrep.nmc_mhead);
1720						req->r_nmrep.nmc_mhead = NULL;
1721					}
1722					req->r_error = 0;
1723					microuptime(&now);
1724					lck_mtx_lock(&req->r_mtx);
1725					req->r_resendtime = now.tv_sec + 2;
1726					req->r_xid = 0;                 // get a new XID
1727					req->r_flags |= R_RESTART;
1728					req->r_start = 0;
1729					nfs_asyncio_resend(req);
1730					lck_mtx_unlock(&req->r_mtx);
1731					if (IS_VALID_CRED(cred))
1732						kauth_cred_unref(&cred);
1733					/* Note: nfsreq reference taken will be dropped later when finished */
1734					return;
1735				}
1736				/* otherwise, just pause a couple seconds and retry */
1737				tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz);
1738			}
1739			if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
1740				rlen = 0;
1741				goto readagain;
1742			}
1743		}
1744	}
1745	if (error) {
1746		SET(bp->nb_flags, NB_ERROR);
1747		bp->nb_error = error;
1748		goto out;
1749	}
1750
1751	if ((rlen > 0) && (bp->nb_endio < (offset + (int)rlen)))
1752		bp->nb_endio = offset + rlen;
1753
1754	if ((nfsvers == NFS_VER2) || eof || (rlen == 0)) {
1755		/* zero out the remaining data (up to EOF) */
1756		off_t rpcrem, eofrem, rem;
1757		rpcrem = (length - rlen);
1758		eofrem = np->n_size - (NBOFF(bp) + offset + rlen);
1759		rem = (rpcrem < eofrem) ? rpcrem : eofrem;
1760		if (rem > 0)
1761			bzero(bp->nb_data + offset + rlen, rem);
1762	} else if (((int)rlen < length) && !ISSET(bp->nb_flags, NB_ERROR)) {
1763		/*
1764		 * short read
1765		 *
1766		 * We haven't hit EOF and we didn't get all the data
1767		 * requested, so we need to issue another read for the rest.
1768		 * (Don't bother if the buffer already hit an error.)
1769		 */
1770readagain:
1771		offset += rlen;
1772		length -= rlen;
1773		cb.rcb_args[0] = offset;
1774		cb.rcb_args[1] = length;
1775		if (nmp->nm_vers >= NFS_VER4)
1776			cb.rcb_args[2] = nmp->nm_stategenid;
1777		error = nmp->nm_funcs->nf_read_rpc_async(np, NBOFF(bp) + offset, length, thd, cred, &cb, &rreq);
1778		if (!error) {
1779			if (IS_VALID_CRED(cred))
1780				kauth_cred_unref(&cred);
1781			if (!cb.rcb_func) {
1782				/* if !async we'll need to wait for this RPC to finish */
1783				req = rreq;
1784				rreq = NULL;
1785				goto finish;
1786			}
1787			nfs_request_rele(req);
1788			/*
1789			 * We're done here.
1790			 * Outstanding RPC count is unchanged.
1791			 * Callback will be called when RPC is done.
1792			 */
1793			return;
1794		}
1795		SET(bp->nb_flags, NB_ERROR);
1796		bp->nb_error = error;
1797	}
1798
1799out:
1800	if (cb.rcb_func)
1801		nfs_request_rele(req);
1802	if (IS_VALID_CRED(cred))
1803		kauth_cred_unref(&cred);
1804
1805	/*
1806	 * Decrement outstanding RPC count on buffer
1807	 * and call nfs_buf_read_finish on last RPC.
1808	 *
1809	 * (Note: when there are multiple async RPCs issued for a
1810	 * buffer we need nfs_buf_mutex to avoid problems when
1811	 * aborting a partially-initiated set of RPCs)
1812	 */
1813
1814	multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
1815	if (multasyncrpc)
1816		lck_mtx_lock(nfs_buf_mutex);
1817
1818	bp->nb_rpcs--;
1819	finished = (bp->nb_rpcs == 0);
1820
1821	if (multasyncrpc)
1822		lck_mtx_unlock(nfs_buf_mutex);
1823
1824	if (finished) {
1825		if (multasyncrpc)
1826			wakeme = &bp->nb_rpcs;
1827		nfs_buf_read_finish(bp);
1828		if (wakeme)
1829			wakeup(wakeme);
1830	}
1831}
1832
1833/*
1834 * Do buffer readahead.
1835 * Initiate async I/O to read buffers not in cache.
1836 */
1837int
1838nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn, thread_t thd, kauth_cred_t cred)
1839{
1840	struct nfsmount *nmp = NFSTONMP(np);
1841	struct nfsbuf *bp;
1842	int error = 0;
1843	uint32_t nra;
1844
1845	if (!nmp)
1846		return (ENXIO);
1847	if (nmp->nm_readahead <= 0)
1848		return (0);
1849	if (*rabnp > lastrabn)
1850		return (0);
1851
1852	for (nra = 0; (nra < nmp->nm_readahead) && (*rabnp <= lastrabn); nra++, *rabnp = *rabnp + 1) {
1853		/* check if block exists and is valid. */
1854		if ((*rabnp * nmp->nm_biosize) >= (off_t)np->n_size) {
1855			/* stop reading ahead if we're beyond EOF */
1856			*rabnp = lastrabn;
1857			break;
1858		}
1859		error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ|NBLK_NOWAIT, &bp);
1860		if (error)
1861			break;
1862		nfs_node_lock_force(np);
1863		np->n_lastrahead = *rabnp;
1864		nfs_node_unlock(np);
1865		if (!bp)
1866			continue;
1867		if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE) &&
1868		    !bp->nb_dirty && !ISSET(bp->nb_flags, (NB_DELWRI|NB_NCRDAHEAD))) {
1869			CLR(bp->nb_flags, NB_CACHE);
1870			bp->nb_valid = 0;
1871			bp->nb_validoff = bp->nb_validend = -1;
1872		}
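		/* only start an async read if the buffer has no cached, dirty, or delayed-write data */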
1873		if ((bp->nb_dirtyend <= 0) && !bp->nb_dirty &&
1874		    !ISSET(bp->nb_flags, (NB_CACHE|NB_DELWRI))) {
1875			SET(bp->nb_flags, (NB_READ|NB_ASYNC));
1876			if (ioflag & IO_NOCACHE)
1877				SET(bp->nb_flags, NB_NCRDAHEAD);
1878			if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
1879				kauth_cred_ref(cred);
1880				bp->nb_rcred = cred;
1881			}
1882			if ((error = nfs_buf_read(bp)))
1883				break;
1884			continue;
1885		}
1886		nfs_buf_release(bp, 1);
1887	}
1888	return (error);
1889}
1890
1891/*
1892 * NFS buffer I/O for reading files.
1893 */
1894int
1895nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx)
1896{
1897	vnode_t vp = NFSTOV(np);
1898	struct nfsbuf *bp = NULL;
1899	struct nfsmount *nmp = VTONMP(vp);
1900	daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1;
1901	off_t diff;
1902	int error = 0, n = 0, on = 0;
1903	int nfsvers, biosize, modified, readaheads = 0;
1904	thread_t thd;
1905	kauth_cred_t cred;
1906	int64_t io_resid;
1907
1908	FSDBG_TOP(514, np, uio_offset(uio), uio_resid(uio), ioflag);
1909
1910	nfsvers = nmp->nm_vers;
1911	biosize = nmp->nm_biosize;
1912	thd = vfs_context_thread(ctx);
1913	cred = vfs_context_ucred(ctx);
1914
1915	if (vnode_vtype(vp) != VREG) {
1916		printf("nfs_bioread: type %x unexpected\n", vnode_vtype(vp));
1917		FSDBG_BOT(514, np, 0xd1e0016, 0, EINVAL);
1918		return (EINVAL);
1919	}
1920
1921	/*
1922	 * For NFS, cache consistency can only be maintained approximately.
1923	 * Although RFC1094 does not specify the criteria, the following is
1924	 * believed to be compatible with the reference port.
1925	 *
1926	 * If the file has changed since the last read RPC or you have
1927	 * written to the file, you may have lost data cache consistency
1928	 * with the server.  So, check for a change, and flush all of the
1929	 * file's data out of the cache.
1930	 * NB: This implies that cache data can be read when up to
1931	 * NFS_MAXATTRTIMO seconds out of date. If you find that you
1932	 * need current attributes, nfs_getattr() can be forced to fetch
1933	 * new attributes (via NATTRINVALIDATE() or NGA_UNCACHED).
1934	 */
1935
1936	if (ISSET(np->n_flag, NUPDATESIZE))
1937		nfs_data_update_size(np, 0);
1938
1939	if ((error = nfs_node_lock(np))) {
1940		FSDBG_BOT(514, np, 0xd1e0222, 0, error);
1941		return (error);
1942	}
1943
1944	if (np->n_flag & NNEEDINVALIDATE) {
1945		np->n_flag &= ~NNEEDINVALIDATE;
1946		nfs_node_unlock(np);
1947		error = nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1);
1948		if (!error)
1949			error = nfs_node_lock(np);
1950		if (error) {
1951			FSDBG_BOT(514, np, 0xd1e0322, 0, error);
1952			return (error);
1953		}
1954	}
1955
1956	modified = (np->n_flag & NMODIFIED);
1957	nfs_node_unlock(np);
1958	/* nfs_getattr() will check changed and purge caches */
1959	error = nfs_getattr(np, NULL, ctx, modified ? NGA_UNCACHED : NGA_CACHED);
1960	if (error) {
1961		FSDBG_BOT(514, np, 0xd1e0004, 0, error);
1962		return (error);
1963	}
1964
1965	if (uio_resid(uio) == 0) {
1966		FSDBG_BOT(514, np, 0xd1e0001, 0, 0);
1967		return (0);
1968	}
1969	if (uio_offset(uio) < 0) {
1970		FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL);
1971		return (EINVAL);
1972	}
1973
1974	/*
1975	 * set up readahead - which may be limited by:
1976	 * + current request length (for IO_NOCACHE)
1977	 * + readahead setting
1978	 * + file size
1979	 */
1980	if (nmp->nm_readahead > 0) {
1981		off_t end = uio_offset(uio) + uio_resid(uio);
1982		if (end > (off_t)np->n_size)
1983			end = np->n_size;
1984		rabn = uio_offset(uio) / biosize;
1985		maxrabn = (end - 1) / biosize;
1986		nfs_node_lock_force(np);
1987		if (!(ioflag & IO_NOCACHE) &&
1988		    (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread+1)))) {
1989			maxrabn += nmp->nm_readahead;
1990			if ((maxrabn * biosize) >= (off_t)np->n_size)
1991				maxrabn = ((off_t)np->n_size - 1)/biosize;
1992		}
1993		if (maxrabn < np->n_lastrahead)
1994			np->n_lastrahead = -1;
1995		if (rabn < np->n_lastrahead)
1996			rabn = np->n_lastrahead + 1;
1997		nfs_node_unlock(np);
1998	} else {
1999		rabn = maxrabn = 0;
2000	}
2001
2002	do {
2003
2004		nfs_data_lock(np, NFS_DATA_LOCK_SHARED);
2005		lbn = uio_offset(uio) / biosize;
2006
2007		/*
2008		 * Copy directly from any cached pages without grabbing the bufs.
2009		 * (If we are NOCACHE and we've issued readahead requests, we need
2010		 * to grab the NB_NCRDAHEAD bufs to drop them.)
2011		 */
2012		if ((!(ioflag & IO_NOCACHE) || !readaheads) &&
2013		    ((uio->uio_segflg == UIO_USERSPACE32 ||
2014		      uio->uio_segflg == UIO_USERSPACE64 ||
2015		      uio->uio_segflg == UIO_USERSPACE))) {
2016			io_resid = uio_resid(uio);
2017			diff = np->n_size - uio_offset(uio);
2018			if (diff < io_resid)
2019				io_resid = diff;
2020			if (io_resid > 0) {
2021				int count = (io_resid > INT_MAX) ? INT_MAX : io_resid;
2022				error = cluster_copy_ubc_data(vp, uio, &count, 0);
2023				if (error) {
2024					nfs_data_unlock(np);
2025					FSDBG_BOT(514, np, uio_offset(uio), 0xcacefeed, error);
2026					return (error);
2027				}
2028			}
2029			/* count any biocache reads that we just copied directly */
2030			if (lbn != (uio_offset(uio)/biosize)) {
2031				OSAddAtomic64((uio_offset(uio)/biosize) - lbn, &nfsstats.biocache_reads);
2032				FSDBG(514, np, 0xcacefeed, uio_offset(uio), error);
2033			}
2034		}
2035
2036		lbn = uio_offset(uio) / biosize;
2037		on = uio_offset(uio) % biosize;
2038		nfs_node_lock_force(np);
2039		np->n_lastread = (uio_offset(uio) - 1) / biosize;
2040		nfs_node_unlock(np);
2041
2042		if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) {
2043			nfs_data_unlock(np);
2044			FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa);
2045			return (0);
2046		}
2047
2048		/* adjust readahead block number, if necessary */
2049		if (rabn < lbn)
2050			rabn = lbn;
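		/* read ahead at most nm_readahead blocks beyond the current block, but not past maxrabn */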
2051		lastrabn = MIN(maxrabn, lbn + nmp->nm_readahead);
2052		if (rabn <= lastrabn) { /* start readaheads */
2053			error = nfs_buf_readahead(np, ioflag, &rabn, lastrabn, thd, cred);
2054			if (error) {
2055				nfs_data_unlock(np);
2056				FSDBG_BOT(514, np, 0xd1e000b, 1, error);
2057				return (error);
2058			}
2059			readaheads = 1;
2060		}
2061
2062		OSAddAtomic64(1, &nfsstats.biocache_reads);
2063
2064		/*
2065		 * If the block is in the cache and has the required data
2066		 * in a valid region, just copy it out.
2067		 * Otherwise, get the block and write back/read in,
2068		 * as required.
2069		 */
2070again:
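		/* amount to copy from this block: what's left of the request, clipped to the block and (below) to EOF */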
2071		io_resid = uio_resid(uio);
2072		n = (io_resid > (biosize - on)) ? (biosize - on) : io_resid;
2073		diff = np->n_size - uio_offset(uio);
2074		if (diff < n)
2075			n = diff;
2076
2077		error = nfs_buf_get(np, lbn, biosize, thd, NBLK_READ, &bp);
2078		if (error) {
2079			nfs_data_unlock(np);
2080			FSDBG_BOT(514, np, 0xd1e000c, 0, error);
2081			return (error);
2082		}
2083
2084		if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE)) {
2085			/*
2086			 * IO_NOCACHE found a cached buffer.
2087			 * Flush the buffer if it's dirty.
2088			 * Invalidate the data if it wasn't just read
2089			 * in as part of a "nocache readahead".
2090			 */
2091			if (bp->nb_dirty || (bp->nb_dirtyend > 0)) {
2092				/* so write the buffer out and try again */
2093				SET(bp->nb_flags, NB_NOCACHE);
2094				goto flushbuffer;
2095			}
2096			if (ISSET(bp->nb_flags, NB_NCRDAHEAD)) {
2097				CLR(bp->nb_flags, NB_NCRDAHEAD);
2098				SET(bp->nb_flags, NB_NOCACHE);
2099			}
2100		}
2101
2102		/* if any pages are valid... */
2103		if (bp->nb_valid) {
2104			/* ...check for any invalid pages in the read range */
2105			int pg, firstpg, lastpg, dirtypg;
2106			dirtypg = firstpg = lastpg = -1;
2107			pg = on/PAGE_SIZE;
2108			while (pg <= (on + n - 1)/PAGE_SIZE) {
2109				if (!NBPGVALID(bp,pg)) {
2110					if (firstpg < 0)
2111						firstpg = pg;
2112					lastpg = pg;
2113				} else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg))
2114					dirtypg = pg;
2115				pg++;
2116			}
2117
2118			/* if there are no invalid pages, we're all set */
2119			if (firstpg < 0) {
2120				if (bp->nb_validoff < 0) {
2121					/* valid range isn't set up, so */
2122					/* set it to what we know is valid */
2123					bp->nb_validoff = trunc_page(on);
2124					bp->nb_validend = round_page(on+n);
2125					nfs_buf_normalize_valid_range(np, bp);
2126				}
2127				goto buffer_ready;
2128			}
2129
2130			/* there are invalid pages in the read range */
2131			if (((dirtypg > firstpg) && (dirtypg < lastpg)) ||
2132			    (((firstpg*PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg+1)*PAGE_SIZE) > bp->nb_dirtyoff))) {
2133				/* there are also dirty page(s) (or range) in the read range, */
2134				/* so write the buffer out and try again */
2135flushbuffer:
2136				CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2137				SET(bp->nb_flags, NB_ASYNC);
2138				if (!IS_VALID_CRED(bp->nb_wcred)) {
2139					kauth_cred_ref(cred);
2140					bp->nb_wcred = cred;
2141				}
2142				error = nfs_buf_write(bp);
2143				if (error) {
2144					nfs_data_unlock(np);
2145					FSDBG_BOT(514, np, 0xd1e000d, 0, error);
2146					return (error);
2147				}
2148				goto again;
2149			}
2150			if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
2151			    (lastpg - firstpg + 1) > (biosize/PAGE_SIZE)/2) {
2152				/* we need to read in more than half the buffer and the */
2153				/* buffer's not dirty, so just fetch the whole buffer */
2154				bp->nb_valid = 0;
2155			} else {
2156				/* read the page range in */
2157				uio_t auio;
2158				char uio_buf[ UIO_SIZEOF(1) ];
2159
2160				NFS_BUF_MAP(bp);
2161				auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64),
2162						UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));
2163				if (!auio) {
2164					error = ENOMEM;
2165				} else {
2166					uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + (firstpg * PAGE_SIZE)),
2167							((lastpg - firstpg + 1) * PAGE_SIZE));
2168					error = nfs_read_rpc(np, auio, ctx);
2169				}
2170				if (error) {
2171					if (ioflag & IO_NOCACHE)
2172						SET(bp->nb_flags, NB_NOCACHE);
2173					nfs_buf_release(bp, 1);
2174					nfs_data_unlock(np);
2175					FSDBG_BOT(514, np, 0xd1e000e, 0, error);
2176					return (error);
2177				}
2178				/* Make sure that the valid range is set to cover this read. */
2179				bp->nb_validoff = trunc_page_32(on);
2180				bp->nb_validend = round_page_32(on+n);
2181				nfs_buf_normalize_valid_range(np, bp);
2182				if (uio_resid(auio) > 0) {
2183					/* if short read, must have hit EOF, */
2184					/* so zero the rest of the range */
2185					bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
2186				}
2187				/* mark the pages (successfully read) as valid */
2188				for (pg=firstpg; pg <= lastpg; pg++)
2189					NBPGVALID_SET(bp,pg);
2190			}
2191		}
2192		/* if no pages are valid, read the whole block */
2193		if (!bp->nb_valid) {
2194			if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
2195				kauth_cred_ref(cred);
2196				bp->nb_rcred = cred;
2197			}
2198			SET(bp->nb_flags, NB_READ);
2199			CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2200			error = nfs_buf_read(bp);
2201			if (ioflag & IO_NOCACHE)
2202				SET(bp->nb_flags, NB_NOCACHE);
2203			if (error) {
2204				nfs_data_unlock(np);
2205				nfs_buf_release(bp, 1);
2206				FSDBG_BOT(514, np, 0xd1e000f, 0, error);
2207				return (error);
2208			}
2209		}
2210buffer_ready:
2211		/* validate read range against valid range and clip */
2212		if (bp->nb_validend > 0) {
2213			diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
2214			if (diff < n)
2215				n = diff;
2216		}
2217		if (n > 0) {
2218			NFS_BUF_MAP(bp);
2219			error = uiomove(bp->nb_data + on, n, uio);
2220		}
2221
2222		nfs_buf_release(bp, 1);
2223		nfs_data_unlock(np);
2224		nfs_node_lock_force(np);
2225		np->n_lastread = (uio_offset(uio) - 1) / biosize;
2226		nfs_node_unlock(np);
2227	} while (error == 0 && uio_resid(uio) > 0 && n > 0);
2228	FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), error);
2229	return (error);
2230}
2231
2232/*
2233 * limit the number of outstanding async I/O writes
2234 */
2235int
2236nfs_async_write_start(struct nfsmount *nmp)
2237{
2238	int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0;
2239	struct timespec ts = {1, 0};
2240
2241	if (nfs_max_async_writes <= 0)
2242		return (0);
2243	lck_mtx_lock(&nmp->nm_lock);
2244	while ((nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) {
2245		if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1)))
2246			break;
2247		msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag|(PZERO-1), "nfsasyncwrites", &ts);
2248		slpflag = 0;
2249	}
2250	if (!error)
2251		nmp->nm_asyncwrites++;
2252	lck_mtx_unlock(&nmp->nm_lock);
2253	return (error);
2254}
2255void
2256nfs_async_write_done(struct nfsmount *nmp)
2257{
2258	if (nmp->nm_asyncwrites <= 0)
2259		return;
2260	lck_mtx_lock(&nmp->nm_lock);
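	/* wake up a waiter only if we were at (or above) the limit */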
2261	if (nmp->nm_asyncwrites-- >= nfs_max_async_writes)
2262		wakeup(&nmp->nm_asyncwrites);
2263	lck_mtx_unlock(&nmp->nm_lock);
2264}
2265
2266/*
2267 * write (or commit) the given NFS buffer
2268 *
2269 * Commit the buffer if we can.
2270 * Write out any dirty range.
2271 * If any dirty pages remain, write them out.
2272 * Mark buffer done.
2273 *
2274 * For async requests, all the work beyond sending the initial
2275 * write RPC is handled in the RPC callback(s).
2276 */
2277int
2278nfs_buf_write(struct nfsbuf *bp)
2279{
2280	int error = 0, oldflags, async;
2281	nfsnode_t np;
2282	thread_t thd;
2283	kauth_cred_t cred;
2284	proc_t p = current_proc();
2285	int iomode, doff, dend, firstpg, lastpg;
2286	uint32_t pagemask;
2287
2288	FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0);
2289
2290	if (!ISSET(bp->nb_lflags, NBL_BUSY))
2291		panic("nfs_buf_write: buffer is not busy???");
2292
2293	np = bp->nb_np;
2294	async = ISSET(bp->nb_flags, NB_ASYNC);
2295	oldflags = bp->nb_flags;
2296
2297	CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI));
2298	if (ISSET(oldflags, NB_DELWRI)) {
2299		lck_mtx_lock(nfs_buf_mutex);
2300		nfs_nbdwrite--;
2301		NFSBUFCNTCHK();
2302		lck_mtx_unlock(nfs_buf_mutex);
2303		wakeup(&nfs_nbdwrite);
2304	}
2305
2306	/* move to clean list */
2307	if (ISSET(oldflags, (NB_ASYNC|NB_DELWRI))) {
2308		lck_mtx_lock(nfs_buf_mutex);
2309		if (bp->nb_vnbufs.le_next != NFSNOLIST)
2310			LIST_REMOVE(bp, nb_vnbufs);
2311		LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2312		lck_mtx_unlock(nfs_buf_mutex);
2313	}
2314	nfs_node_lock_force(np);
2315	np->n_numoutput++;
2316	nfs_node_unlock(np);
2317	vnode_startwrite(NFSTOV(np));
2318
2319	if (p && p->p_stats)
2320		OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);
2321
2322	cred = bp->nb_wcred;
2323	if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ))
2324		cred = bp->nb_rcred;  /* shouldn't really happen, but... */
2325	if (IS_VALID_CRED(cred))
2326		kauth_cred_ref(cred);
2327	thd = async ? NULL : current_thread();
2328
2329	/* We need to make sure the pages are locked before doing I/O.  */
2330	if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(NFSTOV(np))) {
2331		if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2332			error = nfs_buf_upl_setup(bp);
2333			if (error) {
2334				printf("nfs_buf_write: upl create failed %d\n", error);
2335				SET(bp->nb_flags, NB_ERROR);
2336				bp->nb_error = error = EIO;
2337				nfs_buf_iodone(bp);
2338				goto out;
2339			}
2340			nfs_buf_upl_check(bp);
2341		}
2342	}
2343
2344	/* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. */
2345	if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
2346		nfs_buf_check_write_verifier(np, bp);
2347	if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2348		struct nfsmount *nmp = NFSTONMP(np);
2349		if (!nmp) {
2350			SET(bp->nb_flags, NB_ERROR);
2351			bp->nb_error = error = EIO;
2352			nfs_buf_iodone(bp);
2353			goto out;
2354		}
2355		SET(bp->nb_flags, NB_WRITEINPROG);
2356		error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff,
2357				bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf);
2358		CLR(bp->nb_flags, NB_WRITEINPROG);
2359		if (error) {
2360			if (error != NFSERR_STALEWRITEVERF) {
2361				SET(bp->nb_flags, NB_ERROR);
2362				bp->nb_error = error;
2363			}
2364			nfs_buf_iodone(bp);
2365			goto out;
2366		}
2367		bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2368		CLR(bp->nb_flags, NB_NEEDCOMMIT);
2369		nfs_node_lock_force(np);
2370		np->n_needcommitcnt--;
2371		CHECK_NEEDCOMMITCNT(np);
2372		nfs_node_unlock(np);
2373	}
2374	if (!error && (bp->nb_dirtyend > 0)) {
2375		/* sanity check the dirty range */
2376		if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) {
2377			bp->nb_dirtyend = np->n_size - NBOFF(bp);
2378			if (bp->nb_dirtyoff >= bp->nb_dirtyend)
2379				bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2380		}
2381	}
2382	if (!error && (bp->nb_dirtyend > 0)) {
2383		/* there's a dirty range that needs to be written out */
2384		NFS_BUF_MAP(bp);
2385
2386		doff = bp->nb_dirtyoff;
2387		dend = bp->nb_dirtyend;
2388
2389		/* if doff page is dirty, move doff to start of page */
2390		if (NBPGDIRTY(bp, doff / PAGE_SIZE))
2391			doff -= doff & PAGE_MASK;
2392		/* try to expand write range to include preceding dirty pages */
2393		if (!(doff & PAGE_MASK))
2394			while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE))
2395				doff -= PAGE_SIZE;
2396		/* if dend page is dirty, move dend to start of next page */
2397		if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE))
2398			dend = round_page_32(dend);
2399		/* try to expand write range to include trailing dirty pages */
2400		if (!(dend & PAGE_MASK))
2401			while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE))
2402				dend += PAGE_SIZE;
2403		/* make sure to keep dend clipped to EOF */
2404		if ((NBOFF(bp) + dend) > (off_t) np->n_size)
2405			dend = np->n_size - NBOFF(bp);
2406		/* calculate range of complete pages being written */
2407		firstpg = round_page_32(doff) / PAGE_SIZE;
2408		lastpg = (trunc_page_32(dend) - 1) / PAGE_SIZE;
2409		/* calculate mask for that page range */
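		/*
		 * For example, firstpg = 1 and lastpg = 3 give
		 * pagemask = ((1 << 4) - 1) & ~((1 << 1) - 1) = 0xf & ~0x1 = 0xe,
		 * i.e. pages 1, 2, and 3 are set in the mask.
		 */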
2410		pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
2411
2412		/*
2413		 * compare page mask to nb_dirty; if there are other dirty pages
2414		 * then write FILESYNC; otherwise, write UNSTABLE if async and
2415		 * not needcommit/stable; otherwise write FILESYNC
2416		 */
2417		if (bp->nb_dirty & ~pagemask)
2418			iomode = NFS_WRITE_FILESYNC;
2419		else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC)
2420			iomode = NFS_WRITE_UNSTABLE;
2421		else
2422			iomode = NFS_WRITE_FILESYNC;
2423
2424		/* write the whole contiguous dirty range */
2425		bp->nb_offio = doff;
2426		bp->nb_endio = dend;
2427
2428		OSAddAtomic64(1, &nfsstats.write_bios);
2429
2430		SET(bp->nb_flags, NB_WRITEINPROG);
2431		error = nfs_buf_write_rpc(bp, iomode, thd, cred);
2432		/*
2433		 * For async I/O, the callbacks will finish up the
2434		 * write and push out any dirty pages.  Otherwise,
2435		 * the write has already been finished and any dirty
2436		 * pages pushed out.
2437		 */
2438	} else {
2439		if (!error && bp->nb_dirty) /* write out any dirty pages */
2440			error = nfs_buf_write_dirty_pages(bp, thd, cred);
2441		nfs_buf_iodone(bp);
2442	}
2443	/* note: bp is still valid only for !async case */
2444out:
2445	if (!async) {
2446		error = nfs_buf_iowait(bp);
2447		/* move to clean list */
2448		if (oldflags & NB_DELWRI) {
2449			lck_mtx_lock(nfs_buf_mutex);
2450			if (bp->nb_vnbufs.le_next != NFSNOLIST)
2451				LIST_REMOVE(bp, nb_vnbufs);
2452			LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2453			lck_mtx_unlock(nfs_buf_mutex);
2454		}
2455		FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error);
2456		nfs_buf_release(bp, 1);
2457		/* check if we need to invalidate (and we can) */
2458		if ((np->n_flag & NNEEDINVALIDATE) &&
2459		    !(np->n_bflag & (NBINVALINPROG|NBFLUSHINPROG))) {
2460			int invalidate = 0;
2461			nfs_node_lock_force(np);
2462			if (np->n_flag & NNEEDINVALIDATE) {
2463				invalidate = 1;
2464				np->n_flag &= ~NNEEDINVALIDATE;
2465			}
2466			nfs_node_unlock(np);
2467			if (invalidate) {
2468				/*
2469				 * There was a write error and we need to
2470				 * invalidate attrs and flush buffers in
2471				 * order to sync up with the server.
2472				 * (if this write was extending the file,
2473				 * we may no longer know the correct size)
2474				 *
2475				 * But we couldn't call vinvalbuf while holding
2476				 * the buffer busy.  So we call vinvalbuf() after
2477				 * releasing the buffer.
2478				 */
2479				nfs_vinvalbuf2(NFSTOV(np), V_SAVE|V_IGNORE_WRITEERR, thd, cred, 1);
2480			}
2481		}
2482	}
2483
2484	if (IS_VALID_CRED(cred))
2485		kauth_cred_unref(&cred);
2486	return (error);
2487}
2488
2489/*
2490 * finish the writing of a buffer
2491 */
2492void
2493nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2494{
2495	nfsnode_t np = bp->nb_np;
2496	int error = (bp->nb_flags & NB_ERROR) ? bp->nb_error : 0;
2497	int firstpg, lastpg;
2498	uint32_t pagemask;
2499
2500	if ((error == EINTR) || (error == ERESTART)) {
2501		CLR(bp->nb_flags, NB_ERROR);
2502		SET(bp->nb_flags, NB_EINTR);
2503	}
2504
2505	if (!error) {
2506		/* calculate range of complete pages being written */
2507		firstpg = round_page_32(bp->nb_offio) / PAGE_SIZE;
2508		lastpg = (trunc_page_32(bp->nb_endio) - 1) / PAGE_SIZE;
2509		/* calculate mask for that page range written */
2510		pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
2511		/* clear dirty bits for pages we've written */
2512		bp->nb_dirty &= ~pagemask;
2513	}
2514
2515	/* manage needcommit state */
2516	if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) {
2517		if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2518			nfs_node_lock_force(np);
2519			np->n_needcommitcnt++;
2520			nfs_node_unlock(np);
2521			SET(bp->nb_flags, NB_NEEDCOMMIT);
2522		}
2523		/* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
2524		bp->nb_dirtyoff = bp->nb_offio;
2525		bp->nb_dirtyend = bp->nb_endio;
2526	} else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2527		nfs_node_lock_force(np);
2528		np->n_needcommitcnt--;
2529		CHECK_NEEDCOMMITCNT(np);
2530		nfs_node_unlock(np);
2531		CLR(bp->nb_flags, NB_NEEDCOMMIT);
2532	}
2533
2534	CLR(bp->nb_flags, NB_WRITEINPROG);
2535
2536	/*
2537	 * For an unstable write, the buffer is still treated as dirty until
2538	 * a commit (or stable (re)write) is performed.  Buffers needing only
2539	 * a commit are marked with the NB_DELWRI and NB_NEEDCOMMIT flags.
2540	 *
2541	 * If the write was interrupted we set NB_EINTR.  Don't set NB_ERROR
2542	 * because that would cause the buffer to be dropped.  The buffer is
2543	 * still valid and simply needs to be written again.
2544	 */
2545	if ((error == EINTR) || (error == ERESTART) || (!error && (bp->nb_flags & NB_NEEDCOMMIT))) {
2546		CLR(bp->nb_flags, NB_INVAL);
2547		if (!ISSET(bp->nb_flags, NB_DELWRI)) {
2548			SET(bp->nb_flags, NB_DELWRI);
2549			lck_mtx_lock(nfs_buf_mutex);
2550			nfs_nbdwrite++;
2551			NFSBUFCNTCHK();
2552			lck_mtx_unlock(nfs_buf_mutex);
2553		}
2554		/*
2555		 * Since for the NB_ASYNC case, we've reassigned the buffer to the
2556		 * clean list, we have to reassign it back to the dirty one. Ugh.
2557		 */
2558		if (ISSET(bp->nb_flags, NB_ASYNC)) {
2559			/* move to dirty list */
2560			lck_mtx_lock(nfs_buf_mutex);
2561			if (bp->nb_vnbufs.le_next != NFSNOLIST)
2562				LIST_REMOVE(bp, nb_vnbufs);
2563			LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2564			lck_mtx_unlock(nfs_buf_mutex);
2565		}
2566	} else {
2567		/* either there's an error or we don't need to commit */
2568		if (error) {
2569			/*
2570			 * There was a write error and we need to invalidate
2571			 * attrs and flush buffers in order to sync up with the
2572			 * server.  (if this write was extending the file, we
2573			 * may no longer know the correct size)
2574			 *
2575			 * But we can't call vinvalbuf while holding this
2576			 * buffer busy.  Set a flag to do it after releasing
2577			 * the buffer.
2578			 */
2579			nfs_node_lock_force(np);
2580			np->n_error = error;
2581			np->n_flag |= (NWRITEERR | NNEEDINVALIDATE);
2582			NATTRINVALIDATE(np);
2583			nfs_node_unlock(np);
2584		}
2585		/* clear the dirty range */
2586		bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2587	}
2588
2589	if (!error && bp->nb_dirty)
2590		nfs_buf_write_dirty_pages(bp, thd, cred);
2591	nfs_buf_iodone(bp);
2592}
2593
2594/*
2595 * write out any pages marked dirty in a buffer
2596 *
2597 * We use unstable writes and follow up with a commit.
2598 * If we catch the write verifier changing we'll restart and
2599 * redo the writes FILESYNC.
2600 */
2601int
2602nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2603{
2604	nfsnode_t np = bp->nb_np;
2605	struct nfsmount *nmp = NFSTONMP(np);
2606	int error = 0, commit, iomode, iomode2, len, pg, count, npages, off;
2607	uint32_t dirty = bp->nb_dirty;
2608	uint64_t wverf;
2609	uio_t auio;
2610	char uio_buf [ UIO_SIZEOF(1) ];
2611
2612	if (!bp->nb_dirty)
2613		return (0);
2614
2615	/* there are pages marked dirty that need to be written out */
2616	OSAddAtomic64(1, &nfsstats.write_bios);
2617	NFS_BUF_MAP(bp);
2618	SET(bp->nb_flags, NB_WRITEINPROG);
2619	npages = bp->nb_bufsize / PAGE_SIZE;
2620	iomode = NFS_WRITE_UNSTABLE;
2621
2622	auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE,
2623		&uio_buf, sizeof(uio_buf));
2624
2625again:
2626	dirty = bp->nb_dirty;
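	/* remember the verifier we started with so a change (server restart) can be detected below */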
2627	wverf = bp->nb_verf;
2628	commit = NFS_WRITE_FILESYNC;
2629	for (pg = 0; pg < npages; pg++) {
2630		if (!NBPGDIRTY(bp, pg))
2631			continue;
2632		count = 1;
2633		while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count))
2634			count++;
2635		/* write count pages starting with page pg */
2636		off = pg * PAGE_SIZE;
2637		len = count * PAGE_SIZE;
2638		/* clip writes to EOF */
2639		if (NBOFF(bp) + off + len > (off_t) np->n_size)
2640			len -= (NBOFF(bp) + off + len) - np->n_size;
2641		if (len > 0) {
2642			iomode2 = iomode;
2643			uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE);
2644			uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len);
2645			error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf);
2646			if (error)
2647				break;
2648			if (iomode2 < commit) /* Retain the lowest commitment level returned. */
2649				commit = iomode2;
2650			if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) {
2651				/* verifier changed, redo all the writes filesync */
2652				iomode = NFS_WRITE_FILESYNC;
2653				goto again;
2654			}
2655		}
2656		/* clear dirty bits */
2657		while (count--) {
2658			dirty &= ~(1 << pg);
2659			if (count) /* leave pg on last page */
2660				pg++;
2661		}
2662	}
2663	CLR(bp->nb_flags, NB_WRITEINPROG);
2664
2665	if (!error && (commit != NFS_WRITE_FILESYNC)) {
2666		error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf);
2667		if (error == NFSERR_STALEWRITEVERF) {
2668			/* verifier changed, so we need to restart all the writes */
2669			iomode = NFS_WRITE_FILESYNC;
2670			goto again;
2671		}
2672	}
2673	if (!error) {
2674		bp->nb_dirty = dirty;
2675	} else {
2676		SET(bp->nb_flags, NB_ERROR);
2677		bp->nb_error = error;
2678	}
2679	return (error);
2680}
2681
2682/*
2683 * initiate the NFS WRITE RPC(s) for a buffer
2684 */
2685int
2686nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred)
2687{
2688	struct nfsmount *nmp;
2689	nfsnode_t np = bp->nb_np;
2690	int error = 0, nfsvers, async;
2691	int offset, nrpcs;
2692	uint32_t nmwsize, length, len;
2693	struct nfsreq *req;
2694	struct nfsreq_cbinfo cb;
2695	uio_t auio;
2696	char uio_buf [ UIO_SIZEOF(1) ];
2697
2698	nmp = NFSTONMP(np);
2699	if (!nmp) {
2700		bp->nb_error = error = ENXIO;
2701		SET(bp->nb_flags, NB_ERROR);
2702		nfs_buf_iodone(bp);
2703		return (error);
2704	}
2705	nfsvers = nmp->nm_vers;
2706	nmwsize = nmp->nm_wsize;
2707
2708	offset = bp->nb_offio;
2709	length = bp->nb_endio - bp->nb_offio;
2710
2711	/* Note: Can only do async I/O if nfsiods are configured. */
2712	async = (bp->nb_flags & NB_ASYNC) && (NFSIOD_MAX > 0);
2713	bp->nb_commitlevel = NFS_WRITE_FILESYNC;
2714	cb.rcb_func = async ? nfs_buf_write_rpc_finish : NULL;
2715	cb.rcb_bp = bp;
2716
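	/* NFSv2 only supports 32-bit file offsets, so fail writes extending beyond 4GB with EFBIG */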
2717	if ((nfsvers == NFS_VER2) && ((NBOFF(bp) + bp->nb_endio) > 0xffffffffLL)) {
2718		bp->nb_error = error = EFBIG;
2719		SET(bp->nb_flags, NB_ERROR);
2720		nfs_buf_iodone(bp);
2721		return (error);
2722	}
2723
2724	auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
2725		UIO_WRITE, &uio_buf, sizeof(uio_buf));
2726	uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
2727
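	/* number of RPCs needed is ceil(length / nmwsize), e.g. a 64k range with a 32k wsize takes 2 RPCs */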
2728	bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize;
2729	if (async && (nrpcs > 1)) {
2730		SET(bp->nb_flags, NB_MULTASYNCRPC);
2731	} else {
2732		CLR(bp->nb_flags, NB_MULTASYNCRPC);
2733	}
2734
2735	while (length > 0) {
2736		if (ISSET(bp->nb_flags, NB_ERROR)) {
2737			error = bp->nb_error;
2738			break;
2739		}
2740		len = (length > nmwsize) ? nmwsize : length;
2741		cb.rcb_args[0] = offset;
2742		cb.rcb_args[1] = len;
2743		if (nmp->nm_vers >= NFS_VER4)
2744			cb.rcb_args[2] = nmp->nm_stategenid;
2745		if (async && ((error = nfs_async_write_start(nmp))))
2746			break;
2747		req = NULL;
2748		error = nmp->nm_funcs->nf_write_rpc_async(np, auio, len, thd, cred,
2749				iomode, &cb, &req);
2750		if (error) {
2751			if (async)
2752				nfs_async_write_done(nmp);
2753			break;
2754		}
2755		offset += len;
2756		length -= len;
2757		if (async)
2758			continue;
2759		nfs_buf_write_rpc_finish(req);
2760	}
2761
2762	if (length > 0) {
2763		/*
2764		 * Something bad happened while trying to send the RPCs.
2765		 * Wait for any outstanding requests to complete.
2766		 */
2767		bp->nb_error = error;
2768		SET(bp->nb_flags, NB_ERROR);
2769		if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
2770			nrpcs = (length + nmwsize - 1) / nmwsize;
2771			lck_mtx_lock(nfs_buf_mutex);
2772			bp->nb_rpcs -= nrpcs;
2773			if (bp->nb_rpcs == 0) {
2774				/* No RPCs left, so the buffer's done */
2775				lck_mtx_unlock(nfs_buf_mutex);
2776				nfs_buf_write_finish(bp, thd, cred);
2777			} else {
2778				/* wait for the last RPC to mark it done */
2779				while (bp->nb_rpcs > 0)
2780					msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
2781						"nfs_buf_write_rpc_cancel", NULL);
2782				lck_mtx_unlock(nfs_buf_mutex);
2783			}
2784		} else {
2785			nfs_buf_write_finish(bp, thd, cred);
2786		}
2787		/* It may have just been an interrupt... that's OK */
2788		if (!ISSET(bp->nb_flags, NB_ERROR))
2789			error = 0;
2790	}
2791
2792	return (error);
2793}
2794
2795/*
2796 * finish up an NFS WRITE RPC on a buffer
2797 */
2798void
2799nfs_buf_write_rpc_finish(struct nfsreq *req)
2800{
2801	int error = 0, nfsvers, offset, length, multasyncrpc, finished;
2802	int committed = NFS_WRITE_FILESYNC;
2803	uint64_t wverf = 0;
2804	size_t rlen;
2805	void *wakeme = NULL;
2806	struct nfsreq_cbinfo cb;
2807	struct nfsreq *wreq = NULL;
2808	struct nfsbuf *bp;
2809	struct nfsmount *nmp;
2810	nfsnode_t np;
2811	thread_t thd;
2812	kauth_cred_t cred;
2813	uio_t auio;
2814	char uio_buf [ UIO_SIZEOF(1) ];
2815
2816finish:
2817	np = req->r_np;
2818	thd = req->r_thread;
2819	cred = req->r_cred;
2820	if (IS_VALID_CRED(cred))
2821		kauth_cred_ref(cred);
2822	cb = req->r_callback;
2823	bp = cb.rcb_bp;
2824	if (cb.rcb_func) /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
2825		nfs_request_ref(req, 0);
2826
2827	nmp = NFSTONMP(np);
2828	if (!nmp) {
2829		SET(bp->nb_flags, NB_ERROR);
2830		bp->nb_error = error = ENXIO;
2831	}
2832	if (error || ISSET(bp->nb_flags, NB_ERROR)) {
2833		/* just drop it */
2834		nfs_request_async_cancel(req);
2835		goto out;
2836	}
2837	nfsvers = nmp->nm_vers;
2838
2839	offset = cb.rcb_args[0];
2840	rlen = length = cb.rcb_args[1];
2841
2842	/* finish the RPC */
2843	error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf);
2844	if ((error == EINPROGRESS) && cb.rcb_func) {
2845		/* async request restarted */
2846		if (cb.rcb_func)
2847			nfs_request_rele(req);
2848		if (IS_VALID_CRED(cred))
2849			kauth_cred_unref(&cred);
2850		return;
2851	}
2852	if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
2853		lck_mtx_lock(&nmp->nm_lock);
2854		if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) {
2855			NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
2856				error, NBOFF(bp)+offset, cb.rcb_args[2], nmp->nm_stategenid);
2857			nfs_need_recover(nmp, error);
2858		}
2859		lck_mtx_unlock(&nmp->nm_lock);
2860		if (np->n_flag & NREVOKE) {
2861			error = EIO;
2862		} else {
2863			if (error == NFSERR_GRACE) {
2864				if (cb.rcb_func) {
2865					/*
2866					 * For an async I/O request, handle a grace delay just like
2867					 * jukebox errors.  Set the resend time and queue it up.
2868					 */
2869					struct timeval now;
2870					if (req->r_nmrep.nmc_mhead) {
2871						mbuf_freem(req->r_nmrep.nmc_mhead);
2872						req->r_nmrep.nmc_mhead = NULL;
2873					}
2874					req->r_error = 0;
2875					microuptime(&now);
2876					lck_mtx_lock(&req->r_mtx);
2877					req->r_resendtime = now.tv_sec + 2;
2878					req->r_xid = 0;                 // get a new XID
2879					req->r_flags |= R_RESTART;
2880					req->r_start = 0;
2881					nfs_asyncio_resend(req);
2882					lck_mtx_unlock(&req->r_mtx);
2883					if (IS_VALID_CRED(cred))
2884						kauth_cred_unref(&cred);
2885					/* Note: the nfsreq reference we hold will be dropped later when the request finishes */
2886					return;
2887				}
2888				/* otherwise, just pause a couple seconds and retry */
2889				tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz);
2890			}
2891			if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
2892				rlen = 0;
2893				goto writeagain;
2894			}
2895		}
2896	}
2897	if (error) {
2898		SET(bp->nb_flags, NB_ERROR);
2899		bp->nb_error = error;
2900	}
2901	if (error || (nfsvers == NFS_VER2))
2902		goto out;
2903	if (rlen <= 0) {
2904		SET(bp->nb_flags, NB_ERROR);
2905		bp->nb_error = error = EIO;
2906		goto out;
2907	}
2908
2909	/* save lowest commit level returned */
2910	if (committed < bp->nb_commitlevel)
2911		bp->nb_commitlevel = committed;
2912
2913	/* check the write verifier */
2914	if (!bp->nb_verf) {
2915		bp->nb_verf = wverf;
2916	} else if (bp->nb_verf != wverf) {
2917		/* verifier changed, so buffer will need to be rewritten */
2918		bp->nb_flags |= NB_STALEWVERF;
2919		bp->nb_commitlevel = NFS_WRITE_UNSTABLE;
2920		bp->nb_verf = wverf;
2921	}
2922
2923	/*
2924	 * check for a short write
2925	 *
2926	 * If the server didn't write all the data, then we
2927	 * need to issue another write for the rest of it.
2928	 * (Don't bother if the buffer hit an error or stale wverf.)
2929	 */
2930	if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF|NB_ERROR))) {
2931writeagain:
2932		offset += rlen;
2933		length -= rlen;
2934
2935		auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
2936			UIO_WRITE, &uio_buf, sizeof(uio_buf));
2937		uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
2938
2939		cb.rcb_args[0] = offset;
2940		cb.rcb_args[1] = length;
2941		if (nmp->nm_vers >= NFS_VER4)
2942			cb.rcb_args[2] = nmp->nm_stategenid;
2943
2944		// XXX iomode should really match the original request
2945		error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred,
2946				NFS_WRITE_FILESYNC, &cb, &wreq);
2947		if (!error) {
2948			if (IS_VALID_CRED(cred))
2949				kauth_cred_unref(&cred);
2950			if (!cb.rcb_func) {
2951				/* if !async we'll need to wait for this RPC to finish */
2952				req = wreq;
2953				wreq = NULL;
2954				goto finish;
2955			}
2956			nfs_request_rele(req);
2957			/*
2958			 * We're done here.
2959			 * Outstanding RPC count is unchanged.
2960			 * Callback will be called when RPC is done.
2961			 */
2962			return;
2963		}
2964		SET(bp->nb_flags, NB_ERROR);
2965		bp->nb_error = error;
2966	}
2967
2968out:
2969	if (cb.rcb_func) {
2970		nfs_async_write_done(nmp);
2971		nfs_request_rele(req);
2972	}
2973	/*
2974	 * Decrement outstanding RPC count on buffer
2975	 * and call nfs_buf_write_finish on last RPC.
2976	 *
2977	 * (Note: when there are multiple async RPCs issued for a
2978	 * buffer we need nfs_buf_mutex to avoid problems when
2979	 * aborting a partially-initiated set of RPCs)
2980	 */
2981	multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
2982	if (multasyncrpc)
2983		lck_mtx_lock(nfs_buf_mutex);
2984
2985	bp->nb_rpcs--;
2986	finished = (bp->nb_rpcs == 0);
2987
2988	if (multasyncrpc)
2989		lck_mtx_unlock(nfs_buf_mutex);
2990
2991	if (finished) {
2992		if (multasyncrpc)
2993			wakeme = &bp->nb_rpcs;
2994		nfs_buf_write_finish(bp, thd, cred);
2995		if (wakeme)
2996			wakeup(wakeme);
2997	}
2998
2999	if (IS_VALID_CRED(cred))
3000		kauth_cred_unref(&cred);
3001}
3002
3003/*
3004 * Send commit(s) for the given node's "needcommit" buffers
3005 */
3006int
3007nfs_flushcommits(nfsnode_t np, int nowait)
3008{
3009	struct nfsmount *nmp;
3010	struct nfsbuf *bp, *prevlbp, *lbp;
3011	struct nfsbuflists blist, commitlist;
3012	int error = 0, retv, wcred_set, flags, dirty;
3013	u_quad_t off, endoff, toff;
3014	uint64_t wverf;
3015	u_int32_t count;
3016	kauth_cred_t wcred = NULL;
3017
3018	FSDBG_TOP(557, np, 0, 0, 0);
3019
3020	/*
3021	 * An nb_flags == (NB_DELWRI | NB_NEEDCOMMIT) block has been written to the
3022	 * server, but has not been committed to stable storage on the server
3023	 * yet. The byte range is worked out for as many nfsbufs as we can handle
3024	 * and the commit rpc is done.
3025	 */
3026	if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3027		error = nfs_node_lock(np);
3028		if (error)
3029			goto done;
3030		np->n_flag |= NMODIFIED;
3031		nfs_node_unlock(np);
3032	}
3033
3034	off = (u_quad_t)-1;
3035	endoff = 0;
3036	wcred_set = 0;
3037	LIST_INIT(&commitlist);
3038
3039	nmp = NFSTONMP(np);
3040	if (!nmp) {
3041		error = ENXIO;
3042		goto done;
3043	}
3044	if (nmp->nm_vers == NFS_VER2) {
3045		error = EINVAL;
3046		goto done;
3047	}
3048
3049	flags = NBI_DIRTY;
3050	if (nowait)
3051		flags |= NBI_NOWAIT;
3052	lck_mtx_lock(nfs_buf_mutex);
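	/* only buffers written under the current write verifier are eligible for commit */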
3053	wverf = nmp->nm_verf;
3054	if (!nfs_buf_iterprepare(np, &blist, flags)) {
3055		while ((bp = LIST_FIRST(&blist))) {
3056			LIST_REMOVE(bp, nb_vnbufs);
3057			LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3058			error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0);
3059			if (error)
3060				continue;
3061			if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
3062				nfs_buf_check_write_verifier(np, bp);
3063			if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) ||
3064			    (bp->nb_verf != wverf)) {
3065				nfs_buf_drop(bp);
3066				continue;
3067			}
3068			nfs_buf_remfree(bp);
3069
3070			/* buffer UPLs will be grabbed *in order* below */
3071
3072			FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty);
3073			FSDBG(557, bp->nb_validoff, bp->nb_validend,
3074			      bp->nb_dirtyoff, bp->nb_dirtyend);
3075
3076			/*
3077			 * Work out if all buffers are using the same cred
3078			 * so we can deal with them all with one commit.
3079			 *
3080			 * Note: creds in bp's must be obtained by kauth_cred_ref
3081			 * on the same original cred in order for them to be equal.
3082			 */
3083			if (wcred_set == 0) {
3084				wcred = bp->nb_wcred;
3085				if (!IS_VALID_CRED(wcred))
3086					panic("nfs: needcommit w/out wcred");
3087				wcred_set = 1;
3088			} else if ((wcred_set == 1) && wcred != bp->nb_wcred) {
3089				wcred_set = -1;
3090			}
3091			SET(bp->nb_flags, NB_WRITEINPROG);
3092
3093			/*
3094			 * Add this buffer to the list of buffers we are committing.
3095			 * Buffers are inserted into the list in ascending order so that
3096			 * we can take the UPLs in order after the list is complete.
3097			 */
3098			prevlbp = NULL;
3099			LIST_FOREACH(lbp, &commitlist, nb_vnbufs) {
3100				if (bp->nb_lblkno < lbp->nb_lblkno)
3101					break;
3102				prevlbp = lbp;
3103			}
3104			LIST_REMOVE(bp, nb_vnbufs);
3105			if (prevlbp)
3106				LIST_INSERT_AFTER(prevlbp, bp, nb_vnbufs);
3107			else
3108				LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs);
3109
3110			/* update commit range start, end */
3111			toff = NBOFF(bp) + bp->nb_dirtyoff;
3112			if (toff < off)
3113				off = toff;
3114			toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff);
3115			if (toff > endoff)
3116				endoff = toff;
3117		}
3118		nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3119	}
3120	lck_mtx_unlock(nfs_buf_mutex);
3121
3122	if (LIST_EMPTY(&commitlist)) {
3123		error = ENOBUFS;
3124		goto done;
3125	}
3126
3127	/*
3128	 * We need a UPL to prevent others from accessing the buffers during
3129	 * our commit RPC(s).
3130	 *
3131	 * We used to also check for dirty pages here; if there were any we'd
3132	 * abort the commit and force the entire buffer to be written again.
3133	 * Instead of doing that, we just go ahead and commit the dirty range,
3134	 * and then leave the buffer around with dirty pages that will be
3135	 * written out later.
3136	 */
3137	LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
3138		if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3139			retv = nfs_buf_upl_setup(bp);
3140			if (retv) {
3141				/* Unable to create the UPL, the VM object probably no longer exists. */
3142				printf("nfs_flushcommits: upl create failed %d\n", retv);
3143				bp->nb_valid = bp->nb_dirty = 0;
3144			}
3145		}
3146		nfs_buf_upl_check(bp);
3147	}
3148
3149	/*
3150	 * Commit data on the server, as required.
3151	 * If all bufs are using the same wcred, then use that with
3152	 * one call for all of them, otherwise commit each one
3153	 * separately.
3154	 */
3155	if (wcred_set == 1) {
3156		/*
3157		 * Note, it's possible the commit range could be >2^32-1.
3158		 * If it is, we'll send one commit that covers the whole file.
3159		 */
3160		if ((endoff - off) > 0xffffffff)
3161			count = 0;
3162		else
3163			count = (endoff - off);
3164		retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf);
3165	} else {
3166		retv = 0;
3167		LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
3168			toff = NBOFF(bp) + bp->nb_dirtyoff;
3169			count = bp->nb_dirtyend - bp->nb_dirtyoff;
3170			retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf);
3171			if (retv)
3172				break;
3173		}
3174	}
3175
3176	/*
3177	 * Now, either mark the blocks I/O done or mark the
3178	 * blocks dirty, depending on whether the commit
3179	 * succeeded.
3180	 */
3181	while ((bp = LIST_FIRST(&commitlist))) {
3182		LIST_REMOVE(bp, nb_vnbufs);
3183		FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty);
3184		nfs_node_lock_force(np);
3185		CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG));
3186		np->n_needcommitcnt--;
3187		CHECK_NEEDCOMMITCNT(np);
3188		nfs_node_unlock(np);
3189
3190		if (retv) {
3191			/* move back to dirty list */
3192			lck_mtx_lock(nfs_buf_mutex);
3193			LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3194			lck_mtx_unlock(nfs_buf_mutex);
3195			nfs_buf_release(bp, 1);
3196			continue;
3197		}
3198
3199		nfs_node_lock_force(np);
3200		np->n_numoutput++;
3201		nfs_node_unlock(np);
3202		vnode_startwrite(NFSTOV(np));
3203		if (ISSET(bp->nb_flags, NB_DELWRI)) {
3204			lck_mtx_lock(nfs_buf_mutex);
3205			nfs_nbdwrite--;
3206			NFSBUFCNTCHK();
3207			lck_mtx_unlock(nfs_buf_mutex);
3208			wakeup(&nfs_nbdwrite);
3209		}
3210		CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI));
3211		/* if block still has dirty pages, we don't want it to */
3212		/* be released in nfs_buf_iodone().  So, don't set NB_ASYNC. */
3213		if (!(dirty = bp->nb_dirty))
3214			SET(bp->nb_flags, NB_ASYNC);
3215		else
3216			CLR(bp->nb_flags, NB_ASYNC);
3217
3218		/* move to clean list */
3219		lck_mtx_lock(nfs_buf_mutex);
3220		LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3221		lck_mtx_unlock(nfs_buf_mutex);
3222
3223		bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3224
3225		nfs_buf_iodone(bp);
3226		if (dirty) {
3227			/* throw it back in as a delayed write buffer */
3228			CLR(bp->nb_flags, NB_DONE);
3229			nfs_buf_write_delayed(bp);
3230		}
3231	}
3232
3233done:
3234	FSDBG_BOT(557, np, 0, 0, error);
3235	return (error);
3236}
3237
3238/*
3239 * Flush all the blocks associated with a vnode.
3240 * 	Walk through the buffer pool and push any dirty pages
3241 *	associated with the vnode.
3242 */
3243int
3244nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr)
3245{
3246	struct nfsbuf *bp;
3247	struct nfsbuflists blist;
3248	struct nfsmount *nmp = NFSTONMP(np);
3249	int error = 0, error2, slptimeo = 0, slpflag = 0;
3250	int nfsvers, flags, passone = 1;
3251
3252	FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0);
3253
3254	if (!nmp) {
3255		error = ENXIO;
3256		goto out;
3257	}
3258	nfsvers = nmp->nm_vers;
3259	if (NMFLAG(nmp, INTR))
3260		slpflag = PCATCH;
3261
3262	if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3263		nfs_node_lock_force(np);
3264		np->n_flag |= NMODIFIED;
3265		nfs_node_unlock(np);
3266	}
3267
3268	lck_mtx_lock(nfs_buf_mutex);
3269	while (np->n_bflag & NBFLUSHINPROG) {
3270		np->n_bflag |= NBFLUSHWANT;
3271		error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL);
3272		if ((error && (error != EWOULDBLOCK)) ||
3273		    ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) {
3274			lck_mtx_unlock(nfs_buf_mutex);
3275			goto out;
3276		}
3277	}
3278	np->n_bflag |= NBFLUSHINPROG;
3279
3280	/*
3281	 * On the first pass, start async/unstable writes on all
3282	 * delayed write buffers.  Then wait for all writes to complete
3283	 * and call nfs_flushcommits() to commit any uncommitted buffers.
3284	 * On all subsequent passes, start STABLE writes on any remaining
3285	 * dirty buffers.  Then wait for all writes to complete.
3286	 */
3287again:
3288	FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0);
3289	if (!NFSTONMP(np)) {
3290		lck_mtx_unlock(nfs_buf_mutex);
3291		error = ENXIO;
3292		goto done;
3293	}
3294
3295	/* Start/do any write(s) that are required. */
3296	if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
3297		while ((bp = LIST_FIRST(&blist))) {
3298			LIST_REMOVE(bp, nb_vnbufs);
3299			LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3300			flags = (passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) ? NBAC_NOWAIT : 0;
3301			if (flags != NBAC_NOWAIT)
3302				nfs_buf_refget(bp);
3303			while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) {
3304				FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags);
3305				if (error == EBUSY)
3306					break;
3307				if (error) {
3308					error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3309					if (error2) {
3310						if (flags != NBAC_NOWAIT)
3311							nfs_buf_refrele(bp);
3312						nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3313						lck_mtx_unlock(nfs_buf_mutex);
3314						error = error2;
3315						goto done;
3316					}
3317					if (slpflag == PCATCH) {
3318						slpflag = 0;
3319						slptimeo = 2 * hz;
3320					}
3321				}
3322			}
3323			if (flags != NBAC_NOWAIT)
3324				nfs_buf_refrele(bp);
3325			if (error == EBUSY)
3326				continue;
3327			if (!bp->nb_np) {
3328				/* buffer is no longer valid */
3329				nfs_buf_drop(bp);
3330				continue;
3331			}
3332			if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
3333				nfs_buf_check_write_verifier(np, bp);
3334			if (!ISSET(bp->nb_flags, NB_DELWRI)) {
3335				/* buffer is no longer dirty */
3336				nfs_buf_drop(bp);
3337				continue;
3338			}
3339			FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags);
3340			if ((passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) &&
3341			    ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3342				nfs_buf_drop(bp);
3343				continue;
3344			}
3345			nfs_buf_remfree(bp);
3346			lck_mtx_unlock(nfs_buf_mutex);
3347			if (ISSET(bp->nb_flags, NB_ERROR)) {
3348				nfs_node_lock_force(np);
3349				np->n_error = bp->nb_error ? bp->nb_error : EIO;
3350				np->n_flag |= NWRITEERR;
3351				nfs_node_unlock(np);
3352				nfs_buf_release(bp, 1);
3353				lck_mtx_lock(nfs_buf_mutex);
3354				continue;
3355			}
3356			SET(bp->nb_flags, NB_ASYNC);
3357			if (!passone) {
3358				/* NB_STABLE forces this to be written FILESYNC */
3359				SET(bp->nb_flags, NB_STABLE);
3360			}
3361			nfs_buf_write(bp);
3362			lck_mtx_lock(nfs_buf_mutex);
3363		}
3364		nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3365	}
3366	lck_mtx_unlock(nfs_buf_mutex);
3367
3368	if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
3369		while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) {
3370			error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3371			if (error2) {
3372				error = error2;
3373				goto done;
3374			}
3375			if (slpflag == PCATCH) {
3376				slpflag = 0;
3377				slptimeo = 2 * hz;
3378			}
3379		}
3380	}
3381
3382	if (nfsvers != NFS_VER2) {
3383		/* loop while it looks like there are still buffers to be */
3384		/* committed and nfs_flushcommits() seems to be handling them. */
3385		while (np->n_needcommitcnt)
3386			if (nfs_flushcommits(np, 0))
3387				break;
3388	}
3389
3390	if (passone) {
3391		passone = 0;
3392		if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3393			nfs_node_lock_force(np);
3394			np->n_flag |= NMODIFIED;
3395			nfs_node_unlock(np);
3396		}
3397		lck_mtx_lock(nfs_buf_mutex);
3398		goto again;
3399	}
3400
3401	if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
3402		if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3403			nfs_node_lock_force(np);
3404			np->n_flag |= NMODIFIED;
3405			nfs_node_unlock(np);
3406		}
3407		lck_mtx_lock(nfs_buf_mutex);
3408		if (!LIST_EMPTY(&np->n_dirtyblkhd))
3409			goto again;
3410		lck_mtx_unlock(nfs_buf_mutex);
3411		nfs_node_lock_force(np);
3412		/*
3413		 * OK, it looks like there are no dirty blocks.  If we have no
3414		 * writes in flight and no one in the write code, we can clear
3415		 * the modified flag.  In order to make sure we see the latest
3416		 * attributes and size, we also invalidate the attributes and
3417		 * advance the attribute cache XID to guarantee that attributes
3418		 * newer than our clearing of NMODIFIED will get loaded next.
3419		 * (If we don't do this, it's possible for the flush's final
3420		 * write/commit (xid1) to be executed in parallel with a subsequent
3421		 * getattr request (xid2).  The getattr could return attributes
3422		 * from *before* the write/commit completed but the stale attributes
3423		 * would be preferred because of the xid ordering.)
3424		 */
3425		if (!np->n_wrbusy && !np->n_numoutput) {
3426			np->n_flag &= ~NMODIFIED;
3427			NATTRINVALIDATE(np);
3428			nfs_get_xid(&np->n_xid);
3429		}
3430	} else {
3431		nfs_node_lock_force(np);
3432	}
3433
3434	FSDBG(526, np->n_flag, np->n_error, 0, 0);
3435	if (!ignore_writeerr && (np->n_flag & NWRITEERR)) {
3436		error = np->n_error;
3437		np->n_flag &= ~NWRITEERR;
3438	}
3439	nfs_node_unlock(np);
3440done:
3441	lck_mtx_lock(nfs_buf_mutex);
3442	flags = np->n_bflag;
3443	np->n_bflag &= ~(NBFLUSHINPROG|NBFLUSHWANT);
3444	lck_mtx_unlock(nfs_buf_mutex);
3445	if (flags & NBFLUSHWANT)
3446		wakeup(&np->n_bflag);
3447out:
3448	FSDBG_BOT(517, np, error, ignore_writeerr, 0);
3449	return (error);
3450}
3451
3452/*
3453 * Flush out and invalidate all buffers associated with a vnode.
3454 * Called with the underlying object locked.
3455 */
3456int
3457nfs_vinvalbuf_internal(
3458	nfsnode_t np,
3459	int flags,
3460	thread_t thd,
3461	kauth_cred_t cred,
3462	int slpflag,
3463	int slptimeo)
3464{
3465	struct nfsbuf *bp;
3466	struct nfsbuflists blist;
3467	int list, error = 0;
3468
3469	if (flags & V_SAVE) {
3470		if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR))))
3471			return (error);
3472	}
3473
3474	lck_mtx_lock(nfs_buf_mutex);
3475	for (;;) {
3476		list = NBI_CLEAN;
3477		if (nfs_buf_iterprepare(np, &blist, list)) {
3478			list = NBI_DIRTY;
3479			if (nfs_buf_iterprepare(np, &blist, list))
3480				break;
3481		}
3482		while ((bp = LIST_FIRST(&blist))) {
3483			LIST_REMOVE(bp, nb_vnbufs);
3484			if (list == NBI_CLEAN)
3485				LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3486			else
3487				LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3488			nfs_buf_refget(bp);
3489			while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) {
3490				FSDBG(556, np, bp, NBOFF(bp), bp->nb_flags);
3491				if (error != EAGAIN) {
3492					FSDBG(554, np, bp, -1, error);
3493					nfs_buf_refrele(bp);
3494					nfs_buf_itercomplete(np, &blist, list);
3495					lck_mtx_unlock(nfs_buf_mutex);
3496					return (error);
3497				}
3498			}
3499			nfs_buf_refrele(bp);
3500			FSDBG(554, np, bp, NBOFF(bp), bp->nb_flags);
3501			lck_mtx_unlock(nfs_buf_mutex);
3502			if ((flags & V_SAVE) && UBCINFOEXISTS(NFSTOV(np)) && bp->nb_np &&
3503			    (NBOFF(bp) < (off_t)np->n_size)) {
3504				/* extra paranoia: make sure we're not */
3505				/* somehow leaving any dirty data around */
3506				int mustwrite = 0;
3507				int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ?
3508				    ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize;
3509				if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3510					error = nfs_buf_upl_setup(bp);
3511					if (error == EINVAL) {
3512						/* vm object must no longer exist */
3513						/* hopefully we don't need to do */
3514						/* anything for this buffer */
3515					} else if (error)
3516						printf("nfs_vinvalbuf: upl setup failed %d\n", error);
3517					bp->nb_valid = bp->nb_dirty = 0;
3518				}
3519				nfs_buf_upl_check(bp);
3520				/* check for any dirty data before the EOF */
3521				if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
3522					/* clip dirty range to EOF */
3523					if (bp->nb_dirtyend > end) {
3524						bp->nb_dirtyend = end;
3525						if (bp->nb_dirtyoff >= bp->nb_dirtyend)
3526							bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3527					}
3528					if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end))
3529						mustwrite++;
3530				}
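				/* drop dirty bits for pages that lie entirely beyond EOF */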
3531				bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1;
3532				if (bp->nb_dirty)
3533					mustwrite++;
3534				/* also make sure we'll have a credential to do the write */
3535				if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) {
3536					printf("nfs_vinvalbuf: found dirty buffer with no write creds\n");
3537					mustwrite = 0;
3538				}
3539				if (mustwrite) {
3540					FSDBG(554, np, bp, 0xd00dee, bp->nb_flags);
3541					if (!ISSET(bp->nb_flags, NB_PAGELIST))
3542						panic("nfs_vinvalbuf: dirty buffer without upl");
3543					/* gotta write out dirty data before invalidating */
3544					/* (NB_STABLE indicates that data writes should be FILESYNC) */
3545					/* (NB_NOCACHE indicates buffer should be discarded) */
3546					CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
3547					SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
3548					if (!IS_VALID_CRED(bp->nb_wcred)) {
3549						kauth_cred_ref(cred);
3550						bp->nb_wcred = cred;
3551					}
3552					error = nfs_buf_write(bp);
3553					// Note: bp has been released
3554					if (error) {
3555						FSDBG(554, bp, 0xd00dee, 0xbad, error);
3556						nfs_node_lock_force(np);
3557						if ((error != EINTR) && (error != ERESTART)) {
3558							np->n_error = error;
3559							np->n_flag |= NWRITEERR;
3560						}
3561						/*
3562						 * There was a write error and we need to
3563						 * invalidate attrs to sync with server.
3564						 * (if this write was extending the file,
3565						 * we may no longer know the correct size)
3566						 */
3567						NATTRINVALIDATE(np);
3568						nfs_node_unlock(np);
3569						if ((error == EINTR) || (error == ERESTART)) {
3570							/*
3571							 * Abort on EINTR.  If we don't, we could
3572							 * be stuck in this loop forever because
3573							 * the buffer will continue to stay dirty.
3574							 */
							lck_mtx_lock(nfs_buf_mutex);
							nfs_buf_itercomplete(np, &blist, list);
							lck_mtx_unlock(nfs_buf_mutex);
							return (error);
						}
						error = 0;
					}
					lck_mtx_lock(nfs_buf_mutex);
					continue;
				}
			}
			SET(bp->nb_flags, NB_INVAL);
			// hold off on FREEUPs until we're done here
			nfs_buf_release(bp, 0);
			lck_mtx_lock(nfs_buf_mutex);
		}
		nfs_buf_itercomplete(np, &blist, list);
	}
	if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd))
		panic("nfs_vinvalbuf: flush/inval failed");
	lck_mtx_unlock(nfs_buf_mutex);
	nfs_node_lock_force(np);
	if (!(flags & V_SAVE))
		np->n_flag &= ~NMODIFIED;
	if (vnode_vtype(NFSTOV(np)) == VREG)
		np->n_lastrahead = -1;
	nfs_node_unlock(np);
	NFS_BUF_FREEUP();
	return (0);
}


/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vnode_t vp, int flags, vfs_context_t ctx, int intrflg)
{
	return nfs_vinvalbuf2(vp, flags, vfs_context_thread(ctx), vfs_context_ucred(ctx), intrflg);
}
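
/*
 * Usage sketch (illustrative only; vp and ctx stand for a caller's vnode
 * and vfs_context_t): a caller that wants dirty data pushed to the server
 * before the cache is tossed passes V_SAVE, while passing 0 for flags
 * simply discards all cached buffers.  The intrflg argument only has an
 * effect on mounts with the INTR option set.
 *
 *	error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1);
 *	if (error)
 *		return (error);
 */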

int
nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrflg)
{
	nfsnode_t np = VTONFS(vp);
	struct nfsmount *nmp = VTONMP(vp);
	int error, slpflag, slptimeo, nflags, retry = 0;
	struct timespec ts = { 2, 0 };
	off_t size;

	FSDBG_TOP(554, np, flags, intrflg, 0);

	if (nmp && !NMFLAG(nmp, INTR))
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}

	/* First wait for any other process doing a flush to complete.  */
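	/*
	 * NBINVALINPROG marks a flush/invalidate in progress; anyone else
	 * arriving here sets NBINVALWANT and sleeps until the flusher clears
	 * the flag and issues the wakeup() in the "done:" path below.
	 */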
	lck_mtx_lock(nfs_buf_mutex);
	while (np->n_bflag & NBINVALINPROG) {
		np->n_bflag |= NBINVALWANT;
		msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts);
		if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
			lck_mtx_unlock(nfs_buf_mutex);
			return (error);
		}
		if (np->n_bflag & NBINVALINPROG)
			slpflag = 0;
	}
	np->n_bflag |= NBINVALINPROG;
	lck_mtx_unlock(nfs_buf_mutex);

	/* Now, flush as required.  */
again:
	error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0);
	while (error) {
		FSDBG(554, np, 0, 0, error);
		if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0)))
			goto done;
		error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo);
	}

	/* get the pages out of vm also */
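	/*
	 * UBC_PUSHALL|UBC_SYNC|UBC_INVALIDATE asks the UBC to synchronously
	 * push any remaining dirty pages and then invalidate the range, so
	 * the VM copy of the file is dropped along with the nfsbufs.
	 */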
	if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp)))
		if ((error = ubc_msync(vp, 0, size, NULL, UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE))) {
			if (error == EINVAL)
				panic("nfs_vinvalbuf(): ubc_msync failed, error %d", error);
			if (retry++ < 10) /* retry invalidating a few times */
				goto again;
			/* give up */
			printf("nfs_vinvalbuf(): ubc_msync failed, error %d\n", error);
		}
done:
	lck_mtx_lock(nfs_buf_mutex);
	nflags = np->n_bflag;
	np->n_bflag &= ~(NBINVALINPROG|NBINVALWANT);
	lck_mtx_unlock(nfs_buf_mutex);
	if (nflags & NBINVALWANT)
		wakeup(&np->n_bflag);

	FSDBG_BOT(554, np, flags, intrflg, error);
	return (error);
}

/*
 * Wait for any busy buffers to complete.
 */
void
nfs_wait_bufs(nfsnode_t np)
{
	struct nfsbuf *bp;
	struct nfsbuflists blist;
	int error = 0;

	lck_mtx_lock(nfs_buf_mutex);
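	/* walk the clean list, acquiring each buffer in turn (waiting out any busy ones) and then dropping it */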
	if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) {
		while ((bp = LIST_FIRST(&blist))) {
			LIST_REMOVE(bp, nb_vnbufs);
			LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
			nfs_buf_refget(bp);
			while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
				if (error != EAGAIN) {
					nfs_buf_refrele(bp);
					nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
					lck_mtx_unlock(nfs_buf_mutex);
					return;
				}
			}
			nfs_buf_refrele(bp);
			nfs_buf_drop(bp);
		}
		nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
	}
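	/* then do the same for the dirty list */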
	if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
		while ((bp = LIST_FIRST(&blist))) {
			LIST_REMOVE(bp, nb_vnbufs);
			LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
			nfs_buf_refget(bp);
			while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
				if (error != EAGAIN) {
					nfs_buf_refrele(bp);
					nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
					lck_mtx_unlock(nfs_buf_mutex);
					return;
				}
			}
			nfs_buf_refrele(bp);
			nfs_buf_drop(bp);
		}
		nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
	}
	lck_mtx_unlock(nfs_buf_mutex);
}


/*
 * Add an async I/O request to the mount's async I/O queue and make
 * sure that an nfsiod will service it.
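 *
 * If an idle nfsiod is available it is assigned to the mount and woken;
 * otherwise the mount is queued for the next free nfsiod, spinning up
 * additional nfsiod threads (up to NFSIOD_MAX) as needed.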
 */
void
nfs_asyncio_finish(struct nfsreq *req)
{
	struct nfsmount *nmp;
	struct nfsiod *niod;
	int started = 0;

	FSDBG_TOP(552, req->r_nmp, 0, 0, 0);
again:
	if (((nmp = req->r_nmp)) == NULL)
		return;
	lck_mtx_lock(nfsiod_mutex);
	niod = nmp->nm_niod;

	/* grab an nfsiod if we don't have one already */
	if (!niod) {
		niod = TAILQ_FIRST(&nfsiodfree);
		if (niod) {
			TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
			TAILQ_INSERT_TAIL(&nfsiodwork, niod, niod_link);
			niod->niod_nmp = nmp;
		} else if (((nfsiod_thread_count < NFSIOD_MAX) || (nfsiod_thread_count <= 0)) && (started < 4)) {
			/*
			 * Try starting a new thread.
			 * We may try a couple times if other callers
			 * get the new threads before we do.
			 */
			lck_mtx_unlock(nfsiod_mutex);
			started++;
			if (!nfsiod_start())
				goto again;
			lck_mtx_lock(nfsiod_mutex);
		}
	}

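	/* queue the request unless it's already on an async I/O chain (NFSREQNOLIST marks it as unqueued) */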
	if (req->r_achain.tqe_next == NFSREQNOLIST)
		TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain);

	/* If this mount doesn't already have an nfsiod working on it... */
	if (!nmp->nm_niod) {
		if (niod) { /* give it the nfsiod we just grabbed */
			nmp->nm_niod = niod;
			lck_mtx_unlock(nfsiod_mutex);
			wakeup(niod);
		} else if (nfsiod_thread_count > 0) {
			/* just queue it up on nfsiod mounts queue */
			TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
			lck_mtx_unlock(nfsiod_mutex);
		} else {
			printf("nfs_asyncio_finish(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started);
			lck_mtx_unlock(nfsiod_mutex);
			/* we have no other option but to be persistent */
			started = 0;
			goto again;
		}
	} else {
		lck_mtx_unlock(nfsiod_mutex);
	}

	FSDBG_BOT(552, nmp, 0, 0, 0);
}

/*
 * Queue up an async I/O request for resend.
 */
void
nfs_asyncio_resend(struct nfsreq *req)
{
	struct nfsmount *nmp = req->r_nmp;

	if (!nmp)
		return;
	nfs_gss_clnt_rpcdone(req);
	lck_mtx_lock(&nmp->nm_lock);
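	/* only queue the request once; R_RESENDQ means it's already on the resend queue */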
	if (!(req->r_flags & R_RESENDQ)) {
		TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain);
		req->r_flags |= R_RESENDQ;
	}
	nfs_mount_sock_thread_wake(nmp);
	lck_mtx_unlock(&nmp->nm_lock);
}

/*
 * Read directory data into a buffer.
 *
 * The buffer will be filled (unless EOF is hit).
 * Buffers after this one may also be completely/partially filled.
 */
int
nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx)
{
	nfsnode_t np = bp->nb_np;
	struct nfsmount *nmp = NFSTONMP(np);
	int error = 0;

	if (!nmp)
		return (ENXIO);

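	/* issue the readdir RPC appropriate for the mount's NFS version */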
	if (nmp->nm_vers < NFS_VER4)
		error = nfs3_readdir_rpc(np, bp, ctx);
	else
		error = nfs4_readdir_rpc(np, bp, ctx);

	if (error && (error != NFSERR_DIRBUFDROPPED)) {
		SET(bp->nb_flags, NB_ERROR);
		bp->nb_error = error;
	}
	return (error);
}
