nfs_bio.c revision 19070
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.5 (Berkeley) 1/4/94
 * $Id: nfs_bio.c,v 1.27 1996/10/12 17:39:39 bde Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
					struct proc *p));

extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON];
extern int nfs_numasync;
extern struct nfsstats nfsstats;

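/*
 * vfs.nfs.dwrite controls the delayed-write fallback in nfs_asyncio()
 * below: when set (the default), an async write that finds no idle
 * nfsiod is turned into a delayed write; when clear, such writes are
 * done synchronously instead.
 */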
static int nfs_dwrite = 1;
SYSCTL_INT(_vfs_nfs, OID_AUTO, dwrite, CTLFLAG_RW, &nfs_dwrite, 0, "");

/*
 * Ifdefs for FreeBSD-current's merged VM/buffer cache. It is unfortunate
 * that this isn't done inside getblk() and brelse() so these calls
 * wouldn't need to be here.
 */
#ifdef B_VMIO
#define vnode_pager_uncache(vp)
#else
#define vfs_busy_pages(bp, f)
#define vfs_unbusy_pages(bp)
#define vfs_dirty_pages(bp)
#endif

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, diff, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bufsize;
	int nra, error = 0, n = 0, on = 0, not_readin;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {

	    /*
	     * Get a valid lease. If cached data is stale, flush it.
	     */
	    if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
		    do {
			error = nqnfs_getlease(vp, ND_READ, cred, p);
		    } while (error == NQNFS_EXPIRED);
		    if (error)
			return (error);
		    if (np->n_lrev != np->n_brev ||
			(np->n_flag & NQNFSNONCACHE) ||
			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
			if (vp->v_type == VDIR)
			    nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
			    return (error);
			np->n_brev = np->n_lrev;
		    }
		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
		    nfs_invaldir(vp);
		    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
		    if (error)
			return (error);
		}
	    }
	    if (np->n_flag & NQNFSNONCACHE) {
		switch (vp->v_type) {
		case VREG:
			return (nfs_readrpc(vp, uio, cred));
		case VLNK:
			return (nfs_readlinkrpc(vp, uio, cred));
		case VDIR:
			break;
		default:
			printf(" NQNFSNONCACHE: type %x unexpected\n",
				vp->v_type);
		}
	    }
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
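		/*
		 * Example, assuming the common 8192 byte biosize: an
		 * offset of 12288 maps to lbn 1 and on 4096, i.e. the
		 * read starts 4096 bytes into the second cache block.
		 */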
		not_readin = 1;

		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
		    for (nra = 0; nra < nmp->nm_readahead &&
			(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
			rabn = lbn + 1 + nra;
			if (!incore(vp, rabn)) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, p);
			    if (!rabp)
				return (EINTR);
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		    }
		}

		/*
		 * If the block is in the cache and has the required data
		 * in a valid region, just copy it out.
		 * Otherwise, get the block and write back/read in,
		 * as required.
		 */
again:
		bufsize = biosize;
		if ((off_t)(lbn + 1) * biosize > np->n_size &&
		    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
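		/*
		 * For instance, a 1000 byte tail block is rounded up to
		 * 1024 bytes (two DEV_BSIZE sectors) so the buffer always
		 * covers whole sector-sized units.
		 */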
		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
			not_readin = 0;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, p);
			if (error) {
			    brelse(bp);
			    return (error);
			}
		}
		if (bufsize > on) {
			n = min((unsigned)(bufsize - on), uio->uio_resid);
		} else {
			n = 0;
		}
		diff = np->n_size - uio->uio_offset;
		if (diff < n)
			n = diff;
		if (not_readin && n > 0) {
			if (on < bp->b_validoff || (on + n) > bp->b_validend) {
				bp->b_flags |= B_NOCACHE;
				if (bp->b_dirtyend > 0) {
				    if ((bp->b_flags & B_DELWRI) == 0)
					panic("nfsbioread");
				    if (VOP_BWRITE(bp) == EINTR)
					return (EINTR);
				} else
				    brelse(bp);
				goto again;
			}
		}
		vp->v_lastr = lbn;
		diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
		if (diff < n)
			n = diff;
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_flags |= B_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, p);
			if (error) {
				bp->b_flags |= B_ERROR;
				brelse(bp);
				return (error);
			}
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
		lbn = uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
		if (!bp)
		    return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_flags |= B_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			vfs_unbusy_pages(bp);
			brelse(bp);
			while (error == NFSERR_BAD_COOKIE) {
			    nfs_invaldir(vp);
			    error = nfs_vinvalbuf(vp, 0, cred, p, 1);
			    /*
			     * Yuck! The directory has been modified on the
			     * server. The only way to get the block is by
			     * reading from the beginning to get all the
			     * offset cookies.
			     */
			    for (i = 0; i <= lbn && !error; i++) {
				bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
				if (!bp)
				    return (EINTR);
				if ((bp->b_flags & B_DONE) == 0) {
				    bp->b_flags |= B_READ;
				    vfs_busy_pages(bp, 0);
				    error = nfs_doio(bp, cred, p);
				    if (error) {
					vfs_unbusy_pages(bp);
					brelse(bp);
				    } else if (i < lbn)
					brelse(bp);
				}
			    }
			}
			if (error)
			    return (error);
		    }
		}

		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    (np->n_direofoffset == 0 ||
		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    !(np->n_flag & NQNFSNONCACHE) &&
		    !incore(vp, lbn + 1)) {
			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
			if (rabp) {
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		}
		n = min(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		break;
	    }

	    if (n > 0) {
		error = uiomove(bp->b_data + on, (int)n, uio);
	    }
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		if (np->n_flag & NQNFSNONCACHE)
			bp->b_flags |= B_INVAL;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
	    }
	    brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int  a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bufsize;
	int n, on, error = 0, iomode, must_commit;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize, so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	biosize = vp->v_mount->mnt_stat.f_iosize;
	do {

		/*
		 * XXX make sure we aren't cached in the VM page cache
		 */
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
		    iomode = NFSV3WRITE_FILESYNC;
		    error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
		    if (must_commit)
			nfs_clearcommit(vp->v_mount);
		    return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		if (uio->uio_offset + n > np->n_size) {
			np->n_size = uio->uio_offset + n;
			vnode_pager_setsize(vp, (u_long)np->n_size);
		}
		bufsize = biosize;
		if ((lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
		if (!bp)
			return (EINTR);
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;

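		/*
		 * Clip any recorded dirty region that now hangs past the
		 * end of file (e.g. after a truncate), so bytes beyond
		 * n_size are never pushed to the server.
		 */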
		if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) {
			bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
		}

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 */
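		/*
		 * For example, a block with dirty bytes [0, 100) being
		 * written at on == 4096 fails the contiguity test and the
		 * old region is pushed first, while a write at on == 100
		 * simply grows b_dirtyend.
		 */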
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			bp->b_proc = p;
			if (VOP_BWRITE(bp) == EINTR)
				return (EINTR);
			goto again;
		}

		/*
		 * Check for a valid write lease and get one as required,
		 * in case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
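		/*
		 * Grow the valid region to cover the newly dirtied bytes.
		 * If the old valid region does not touch the new dirty
		 * one, discard it: the gap between them was never read in
		 * and would otherwise be treated as valid data.
		 */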
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.
		 */
		bp->b_flags &= ~B_NEEDCOMMIT;

		/*
		 * If the lease is non-cachable, or IO_SYNC was requested,
		 * do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int biosize = vp->v_mount->mnt_stat.f_iosize;

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);

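	/*
	 * For regular files, express the buffer's address in DEV_BSIZE
	 * units for the underlying buffer code; e.g. with an 8192 byte
	 * biosize, logical block 3 becomes b_blkno 48.
	 */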
	if (vp->v_type == VREG)
		bp->b_blkno = (bn * biosize) / DEV_BSIZE;

	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
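	/*
	 * The first attempt above sleeps interruptibly (slpflag) with no
	 * timeout; on failure we retry with a timeout instead, checking
	 * for a pending signal between passes so an interruptible mount
	 * can still bail out with EINTR.
	 */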
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	register int i;

	if (nfs_numasync == 0)
		return (EIO);
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
	    if (nfs_iodwant[i]) {
		if (bp->b_flags & B_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
		} else {
			bp->b_flags |= B_WRITEINPROG;
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			}
		}
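		/*
		 * Hand the buffer to the idle nfsiod: queue it on
		 * nfs_bufq and wake the daemon, which will take it from
		 * the queue and perform the I/O via nfs_doio().
		 */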
		TAILQ_INSERT_TAIL(&nfs_bufq, bp, b_freelist);
		nfs_iodwant[i] = (struct proc *)0;
		wakeup((caddr_t)&nfs_iodwant[i]);
		return (0);
	    }

	/*
	 * If it is a read, or a write already marked B_WRITEINPROG or
	 * B_NOCACHE, return EIO so the process will call nfs_doio() and
	 * do it synchronously.
	 */
	if (bp->b_flags & (B_READ | B_WRITEINPROG | B_NOCACHE))
		return (EIO);

	/*
	 * Allow the administrator to override the choice of using a delayed
	 * write since it is a pessimization for some servers, notably some
	 * Solaris servers.
	 */
	if (!nfs_dwrite)
		return (EIO);

	/*
	 * Just turn the async write into a delayed write, instead of
	 * doing it synchronously. Hopefully, at least one of the nfsiods
	 * is currently doing a write for this file and will pick up the
	 * delayed writes before going back to sleep.
	 */
	bp->b_flags |= B_DELWRI;
	reassignbuf(bp, bp->b_vp);
	biodone(bp);
	return (0);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (bp->b_flags & B_PHYS) {
	    /*
	     * ...though reading /dev/drum still gets us here.
	     */
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    /* mapping was done by vmapbuf() */
	    io.iov_base = bp->b_data;
	    uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
	    if (bp->b_flags & B_READ) {
		uiop->uio_rw = UIO_READ;
		nfsstats.read_physios++;
		error = nfs_readrpc(vp, uiop, cr);
	    } else {
		int com;

		iomode = NFSV3WRITE_DATASYNC;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_physios++;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else if (bp->b_flags & B_READ) {
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    io.iov_base = bp->b_data;
	    uiop->uio_rw = UIO_READ;
	    switch (vp->v_type) {
	    case VREG:
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		nfsstats.read_bios++;
		error = nfs_readrpc(vp, uiop, cr);
		if (!error) {
		    bp->b_validoff = 0;
		    if (uiop->uio_resid) {
			/*
			 * If len > 0, there is a hole in the file and
			 * no writes after the hole have been pushed to
			 * the server yet.
			 * Just zero fill the rest of the valid area.
			 */
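			/*
			 * E.g. if the server returned 2048 bytes of an
			 * 8192 byte request and the file extends another
			 * 1024 bytes past that point, those 1024 bytes
			 * are zeroed and b_validend becomes 3072.
			 */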
			diff = bp->b_bcount - uiop->uio_resid;
			len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
				+ diff);
			if (len > 0) {
			    len = min(len, uiop->uio_resid);
			    bzero((char *)bp->b_data + diff, len);
			    bp->b_validend = diff + len;
			} else
			    bp->b_validend = diff;
		    } else
			bp->b_validend = bp->b_bcount;
		}
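		/*
		 * If this vnode is an executing text image and the file
		 * changed on the server while in use, the resident pages
		 * no longer match the backing store; kill the process
		 * rather than let it run a mix of old and new pages.
		 */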
		if (p && (vp->v_flag & VTEXT) &&
			(((nmp->nm_flag & NFSMNT_NQNFS) &&
			  NQNFS_CKINVALID(vp, np, ND_READ) &&
			  np->n_lrev != np->n_brev) ||
			 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			  np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
			uprintf("Process killed due to text file modification\n");
			psignal(p, SIGKILL);
#ifdef __NetBSD__
			p->p_holdcnt++;
#else
			p->p_flag |= P_NOSWAP;
#endif
		}
		break;
	    case VLNK:
		uiop->uio_offset = (off_t)0;
		nfsstats.readlink_bios++;
		error = nfs_readlinkrpc(vp, uiop, cr);
		break;
	    case VDIR:
		nfsstats.readdir_bios++;
		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
			error = nfs_readdirplusrpc(vp, uiop, cr);
			if (error == NFSERR_NOTSUPP)
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
		}
		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
			error = nfs_readdirrpc(vp, uiop, cr);
		break;
	    default:
		printf("nfs_doio:  type %x unexpected\n", vp->v_type);
		break;
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else {
	    if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
		bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);

	    if (bp->b_dirtyend > bp->b_dirtyoff) {
		io.iov_len = uiop->uio_resid = bp->b_dirtyend
		    - bp->b_dirtyoff;
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
		    + bp->b_dirtyoff;
		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_bios++;
		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC)
		    iomode = NFSV3WRITE_UNSTABLE;
		else
		    iomode = NFSV3WRITE_FILESYNC;
		bp->b_flags |= B_WRITEINPROG;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
		if (!error && iomode == NFSV3WRITE_UNSTABLE)
		    bp->b_flags |= B_NEEDCOMMIT;
		else
		    bp->b_flags &= ~B_NEEDCOMMIT;
		bp->b_flags &= ~B_WRITEINPROG;

		/*
		 * For an interrupted write, the buffer is still valid
		 * and the write hasn't been pushed to the server yet,
		 * so we can't set B_ERROR and report the interruption
		 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
		 * is not relevant, so the rpc attempt is essentially
		 * a noop.  For the case of a V3 write rpc not being
		 * committed to stable storage, the block is still
		 * dirty and requires either a commit rpc or another
		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
		 * the block is reused. This is indicated by setting
		 * the B_DELWRI and B_NEEDCOMMIT flags.
		 */
		if (error == EINTR
		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
			bp->b_flags |= B_DELWRI;

			/*
			 * Since for the B_ASYNC case, nfs_bwrite() has
			 * reassigned the buffer to the clean list, we have
			 * to reassign it back to the dirty one. Ugh.
			 */
			if (bp->b_flags & B_ASYNC)
				reassignbuf(bp, vp);
			else
				bp->b_flags |= B_EINTR;
		} else {
			if (error) {
				bp->b_flags |= B_ERROR;
				bp->b_error = np->n_error = error;
				np->n_flag |= NWRITEERR;
			}
			bp->b_dirtyoff = bp->b_dirtyend = 0;
		}
	    } else {
		bp->b_resid = 0;
		biodone(bp);
		return (0);
	    }
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	biodone(bp);
	return (error);
}