/* nfs_bio.c revision 7871 */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.5 (Berkeley) 1/4/94
 * $Id: nfs_bio.c,v 1.11 1995/03/04 03:24:34 davidg Exp $
 */
39
40#include <sys/param.h>
41#include <sys/systm.h>
42#include <sys/resourcevar.h>
43#include <sys/signalvar.h>
44#include <sys/proc.h>
45#include <sys/buf.h>
46#include <sys/vnode.h>
47#include <sys/mount.h>
48#include <sys/kernel.h>
49
50#include <vm/vm.h>
51
52#include <nfs/nfsnode.h>
53#include <nfs/rpcv2.h>
54#include <nfs/nfsv2.h>
55#include <nfs/nfs.h>
56#include <nfs/nfsmount.h>
57#include <nfs/nqnfs.h>
58
/* Forward declaration; nfs_getcacheblk() is defined below in this file. */
struct buf *nfs_getcacheblk();
/*
 * Shared with the nfsiod machinery (defined elsewhere): nfs_iodwant[]
 * holds the procs of idle nfsiod daemons waiting for async work, and
 * nfs_numasync is non-zero when async daemons are available.
 */
extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON];
extern int nfs_numasync;
62
/*
 * Vnode op for read using bio.
 * Any similarity to readip() is purely coincidental.
 *
 * vp     - vnode to read from (handles VREG, VLNK and VDIR)
 * uio    - describes the destination buffer and the file offset
 * ioflag - unused here (kept for the vnode op interface)
 * cred   - credentials used for the read rpcs
 *
 * Returns 0 on success or an errno value (EINTR if a cache block
 * could not be obtained on an interruptible mount).
 */
int
nfs_bioread(vp, uio, ioflag, cred)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, diff;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp;
	daddr_t lbn, rabn;
	int nra, error = 0, n = 0, on = 0, not_readin;

#ifdef lint
	ioflag = ioflag;
#endif /* lint */
#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	/* Negative offsets are only tolerated for VDIR (directory cookies). */
	if (uio->uio_offset < 0 && vp->v_type != VDIR)
		return (EINVAL);
	nmp = VFSTONFS(vp->v_mount);
	/* All nfs buffer cache blocks use this one size (see nfs_write). */
	biosize = NFS_MAXDGRAMDATA;
	p = uio->uio_procp;
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * The mount flag NFSMNT_MYWRITE says "Assume that my writes are
	 * the ones changing the modify time.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && vp->v_type != VLNK) {
		if (np->n_flag & NMODIFIED) {
			if ((nmp->nm_flag & NFSMNT_MYWRITE) == 0 ||
			     vp->v_type != VREG) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			np->n_direofoffset = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.ts_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			/* Server mtime changed behind our back: dump the cache. */
			if (np->n_mtime != vattr.va_mtime.ts_sec) {
				np->n_direofoffset = 0;
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.ts_sec;
			}
		}
	}
	do {

	    /*
	     * Get a valid lease. If cached data is stale, flush it.
	     */
	    if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, NQL_READ)) {
		    do {
			error = nqnfs_getlease(vp, NQL_READ, cred, p);
		    } while (error == NQNFS_EXPIRED);
		    if (error)
			return (error);
		    if (np->n_lrev != np->n_brev ||
			(np->n_flag & NQNFSNONCACHE) ||
			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
			if (vp->v_type == VDIR) {
			    np->n_direofoffset = 0;
			    cache_purge(vp);
			}
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
			    return (error);
			np->n_brev = np->n_lrev;
		    }
		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
		    np->n_direofoffset = 0;
		    cache_purge(vp);
		    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
		    if (error)
			return (error);
		}
	    }
	    /*
	     * Non-cachable lease: bypass the buffer cache entirely and do
	     * the rpc straight into the caller's uio.
	     */
	    if (np->n_flag & NQNFSNONCACHE) {
		switch (vp->v_type) {
		case VREG:
			error = nfs_readrpc(vp, uio, cred);
			break;
		case VLNK:
			error = nfs_readlinkrpc(vp, uio, cred);
			break;
		case VDIR:
			error = nfs_readdirrpc(vp, uio, cred);
			break;
		default:
			printf(" NQNFSNONCACHE: type %x unexpected\n",
				vp->v_type);
			break;
		};
		return (error);
	    }
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		/*
		 * not_readin stays set when the block comes from the cache
		 * (i.e. we did not issue the read rpc ourselves below).
		 */
		not_readin = 1;

		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    lbn == vp->v_lastr + 1) {
		    for (nra = 0; nra < nmp->nm_readahead &&
			(lbn + 1 + nra) * biosize < np->n_size; nra++) {
			rabn = lbn + 1 + nra;
			if (!incore(vp, rabn)) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, p);
			    if (!rabp)
				return (EINTR);
			    if ((rabp->b_flags & B_DELWRI) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				vfs_busy_pages(rabp, 0);
				/* No idle nfsiod: abandon the readahead. */
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		    }
		}

		/*
		 * If the block is in the cache and has the required data
		 * in a valid region, just copy it out.
		 * Otherwise, get the block and write back/read in,
		 * as required.
		 */
again:
		bp = nfs_getcacheblk(vp, lbn, biosize, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_flags |= B_READ;
			not_readin = 0;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, p);
			if (error) {
			    brelse(bp);
			    return (error);
			}
		}
		/* Bytes to copy from this block, clamped to end of file. */
		n = min((unsigned)(biosize - on), uio->uio_resid);
		diff = np->n_size - uio->uio_offset;
		if (diff < n)
			n = diff;
		if (not_readin && n > 0) {
			/*
			 * The cached block does not cover the requested
			 * range: push any dirty data and force a re-read.
			 */
			if (on < bp->b_validoff || (on + n) > bp->b_validend) {
				bp->b_flags |= B_NOCACHE;
				if (bp->b_dirtyend > 0) {
				    if ((bp->b_flags & B_DELWRI) == 0)
					panic("nfsbioread");
				    if (VOP_BWRITE(bp) == EINTR)
					return (EINTR);
				} else
				    brelse(bp);
				goto again;
			}
		}
		/* Remember last block read so readahead can detect sequential access. */
		vp->v_lastr = lbn;
		/* Never copy past the valid region of the buffer. */
		diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
		if (diff < n)
			n = diff;
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		/* Symlink contents are cached in logical block 0. */
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_flags |= B_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, p);
			if (error) {
				bp->b_flags |= B_ERROR;
				brelse(bp);
				return (error);
			}
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
		/* For directories the uio offset is the server's offset cookie. */
		lbn = (daddr_t)uio->uio_offset;
		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
		if (!bp)
			return (EINTR);

		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_flags |= B_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, p);
			if (error) {
				bp->b_flags |= B_ERROR;
				brelse(bp);
				return (error);
			}
		}

		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.
		 */
		rabn = bp->b_blkno;
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    rabn != 0 && rabn != np->n_direofoffset &&
		    !incore(vp, rabn)) {
			rabp = nfs_getcacheblk(vp, rabn, NFS_DIRBLKSIZ, p);
			if (rabp) {
			    if ((rabp->b_flags & B_CACHE) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		}
		on = 0;
		n = min(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid);
		break;
	    default:
		printf(" nfsbioread: type %x unexpected\n",vp->v_type);
		break;
	    };

	    /* Copy the selected region of the buffer out to the caller. */
	    if (n > 0) {
		error = uiomove(bp->b_data + on, (int)n, uio);
	    }
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		/* A symlink is read in one shot; force loop termination. */
		n = 0;
		break;
	    case VDIR:
		/* Advance to the next directory block's offset cookie. */
		uio->uio_offset = bp->b_blkno;
		break;
	    default:
		printf(" nfsbioread: type %x unexpected\n",vp->v_type);
		break;
	    }
	    brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}
357
/*
 * Vnode op for write using bio.
 *
 * Buffers the write in the nfs buffer cache, tracking the dirty region
 * of each block, and pushes data synchronously (IO_SYNC / non-cachable
 * lease), asynchronously (full blocks) or as a delayed write otherwise.
 * Returns 0 on success or an errno value.
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int  a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp;
	daddr_t lbn;
	int n, on, error = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	/*
	 * Report (and clear) any error latched by an earlier asynchronous
	 * write (see nfs_doio()).
	 */
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			/* Force fresh attributes so the append offset is current. */
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	nmp = VFSTONFS(vp->v_mount);
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, i don't think it matters
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	biosize = NFS_MAXDGRAMDATA;
	do {

		/*
		 * XXX make sure we aren't cached in the VM page cache
		 */
		/*
		 * Check for a valid write lease.
		 * If non-cachable, just do the rpc
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, NQL_WRITE)) {
			do {
				error = nqnfs_getlease(vp, NQL_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if (np->n_flag & NQNFSNONCACHE)
			return (nfs_writerpc(vp, uio, cred, ioflag));
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		bp = nfs_getcacheblk(vp, lbn, biosize, p);
		if (!bp)
			return (EINTR);
		/* Attach a write credential to the buffer for later rpcs. */
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;
		/* Writing past EOF: grow the file and the VM object. */
		if (uio->uio_offset + n > np->n_size) {
			np->n_size = uio->uio_offset + n;
			vnode_pager_setsize(vp, (u_long)np->n_size);
		}

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			bp->b_proc = p;
			if (VOP_BWRITE(bp) == EINTR)
				return (EINTR);
			goto again;
		}

		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, NQL_WRITE)) {
			do {
				error = nqnfs_getlease(vp, NQL_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		}
		/* Extend (or start) the dirty region to cover this write. */
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
#ifndef notdef
		/* Merge the dirty region into the valid region of the buffer. */
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}
#else
		bp->b_validoff = bp->b_dirtyoff;
		bp->b_validend = bp->b_dirtyend;
#endif
		if (ioflag & IO_APPEND)
			bp->b_flags |= B_APPENDWRITE;

		/*
		 * If the lease is non-cachable or IO_SYNC do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
		} else if ((n + on) == biosize &&
			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			/* Block is now full: start the write, but don't wait. */
			bp->b_proc = (struct proc *)0;
			bawrite(bp);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}
556
557/*
558 * Get an nfs cache block.
559 * Allocate a new one if the block isn't currently in the cache
560 * and return the block marked busy. If the calling process is
561 * interrupted by a signal for an interruptible mount point, return
562 * NULL.
563 */
564struct buf *
565nfs_getcacheblk(vp, bn, size, p)
566	struct vnode *vp;
567	daddr_t bn;
568	int size;
569	struct proc *p;
570{
571	register struct buf *bp;
572	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
573
574	if (nmp->nm_flag & NFSMNT_INT) {
575		bp = getblk(vp, bn, size, PCATCH, 0);
576		while (bp == (struct buf *)0) {
577			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
578				return ((struct buf *)0);
579			bp = getblk(vp, bn, size, 0, 2 * hz);
580		}
581	} else
582		bp = getblk(vp, bn, size, 0, 0);
583
584	if( vp->v_type == VREG)
585		bp->b_blkno = (bn * NFS_MAXDGRAMDATA) / DEV_BSIZE;
586
587	return (bp);
588}
589
/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 *
 * flags   - passed through to vinvalbuf() (e.g. V_SAVE to write dirty data)
 * intrflg - non-zero if signals may interrupt the flush; only honored on
 *           NFSMNT_INT mounts
 *
 * Returns 0 on success, EINTR if interrupted, NFLUSHINPROG in n_flag acts
 * as the single-flusher lock and NFLUSHWANT as the "wakeup wanted" bit.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	/* Interruptible sleeps only make sense on NFSMNT_INT mounts. */
	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		/* Interrupted: release the flush lock and wake any waiter. */
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		/* Retry without PCATCH, but with a timeout so signals are re-checked. */
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}
649
650/*
651 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
652 * This is mainly to avoid queueing async I/O requests when the nfsiods
653 * are all hung on a dead server.
654 */
655int
656nfs_asyncio(bp, cred)
657	register struct buf *bp;
658	struct ucred *cred;
659{
660	register int i;
661
662	if (nfs_numasync == 0)
663		return (EIO);
664	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
665	    if (nfs_iodwant[i]) {
666		if (bp->b_flags & B_READ) {
667			if (bp->b_rcred == NOCRED && cred != NOCRED) {
668				crhold(cred);
669				bp->b_rcred = cred;
670			}
671		} else {
672			if (bp->b_wcred == NOCRED && cred != NOCRED) {
673				crhold(cred);
674				bp->b_wcred = cred;
675			}
676		}
677
678		TAILQ_INSERT_TAIL(&nfs_bufq, bp, b_freelist);
679		nfs_iodwant[i] = (struct proc *)0;
680		wakeup((caddr_t)&nfs_iodwant[i]);
681		return (0);
682	    }
683	return (EIO);
684}
685
/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 *
 * bp - the buffer to fill (B_READ) or flush; direction and mode come
 *      from bp->b_flags (B_PHYS / B_READ / write)
 * cr - credentials for the rpc
 * p  - calling process (may be the nfsiod), used for signal delivery
 *
 * Returns the rpc error (0 on success); always calls biodone(bp).
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	/* Build a single-segment system-space uio over the buffer's data. */
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (bp->b_flags & B_PHYS) {
	    /*
	     * ...though reading /dev/drum still gets us here.
	     */
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    /* mapping was done by vmapbuf() */
	    io.iov_base = bp->b_data;
	    uiop->uio_offset = bp->b_blkno * DEV_BSIZE;
	    if (bp->b_flags & B_READ) {
		uiop->uio_rw = UIO_READ;
		nfsstats.read_physios++;
		error = nfs_readrpc(vp, uiop, cr);
	    } else {
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_physios++;
		error = nfs_writerpc(vp, uiop, cr,0);
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else if (bp->b_flags & B_READ) {
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    io.iov_base = bp->b_data;
	    uiop->uio_rw = UIO_READ;
	    switch (vp->v_type) {
	    case VREG:
		uiop->uio_offset = bp->b_blkno * DEV_BSIZE;
		nfsstats.read_bios++;
		error = nfs_readrpc(vp, uiop, cr);
		if (!error) {
		    bp->b_validoff = 0;
		    if (uiop->uio_resid) {
			/*
			 * If len > 0, there is a hole in the file and
			 * no writes after the hole have been pushed to
			 * the server yet.
			 * Just zero fill the rest of the valid area.
			 */
			diff = bp->b_bcount - uiop->uio_resid;
			len = np->n_size - (bp->b_blkno * DEV_BSIZE
				+ diff);
			if (len > 0) {
			    len = min(len, uiop->uio_resid);
			    bzero((char *)bp->b_data + diff, len);
			    bp->b_validend = diff + len;
			} else
			    bp->b_validend = diff;
		    } else
			bp->b_validend = bp->b_bcount;
		}
		/*
		 * Kill the process if the vnode is an executing text image
		 * (VTEXT) that has been modified on the server behind our
		 * back (detected via lease revision or mtime change).
		 */
		if (p && (vp->v_flag & VTEXT) &&
			(((nmp->nm_flag & NFSMNT_NQNFS) &&
			  NQNFS_CKINVALID(vp, np, NQL_READ) &&
			  np->n_lrev != np->n_brev) ||
			 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			  np->n_mtime != np->n_vattr.va_mtime.ts_sec))) {
			uprintf("Process killed due to text file modification\n");
			psignal(p, SIGKILL);
			p->p_flag |= P_NOSWAP;
		}
		break;
	    case VLNK:
		uiop->uio_offset = 0;
		nfsstats.readlink_bios++;
		error = nfs_readlinkrpc(vp, uiop, cr);
		break;
	    case VDIR:
		/* b_lblkno holds the directory offset cookie to read from. */
		uiop->uio_offset = bp->b_lblkno;
		nfsstats.readdir_bios++;
		if (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS)
		    error = nfs_readdirlookrpc(vp, uiop, cr);
		else
		    error = nfs_readdirrpc(vp, uiop, cr);
		/*
		 * Save offset cookie in b_blkno.
		 */
		bp->b_blkno = uiop->uio_offset;
		break;
	    default:
		printf("nfs_doio:  type %x unexpected\n",vp->v_type);
		break;
	    };
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else {
	    /* Write case: push only the dirty region of the buffer. */
	    io.iov_len = uiop->uio_resid = bp->b_dirtyend
		- bp->b_dirtyoff;
	    uiop->uio_offset = (bp->b_blkno * DEV_BSIZE)
		+ bp->b_dirtyoff;
	    io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
	    uiop->uio_rw = UIO_WRITE;
	    nfsstats.write_bios++;
	    if (bp->b_flags & B_APPENDWRITE)
		error = nfs_writerpc(vp, uiop, cr, IO_APPEND);
	    else
		error = nfs_writerpc(vp, uiop, cr, 0);
	    bp->b_flags &= ~(B_WRITEINPROG | B_APPENDWRITE);

	    /*
	     * For an interrupted write, the buffer is still valid and the
	     * write hasn't been pushed to the server yet, so we can't set
	     * B_ERROR and report the interruption by setting B_EINTR. For
	     * the B_ASYNC case, B_EINTR is not relevant, so the rpc attempt
	     * is essentially a noop.
	     */
	    if (error == EINTR) {
		bp->b_flags &= ~B_INVAL;
		bp->b_flags |= B_DELWRI;

		/*
		 * Since for the B_ASYNC case, nfs_bwrite() has reassigned the
		 * buffer to the clean list, we have to reassign it back to the
		 * dirty one. Ugh.
		 */
		if (bp->b_flags & B_ASYNC)
		    reassignbuf(bp, vp);
		else
		    bp->b_flags |= B_EINTR;
	    } else {
		if (error) {
		    /* Latch the error so the next nfs_write() can report it. */
		    bp->b_flags |= B_ERROR;
		    bp->b_error = np->n_error = error;
		    np->n_flag |= NWRITEERR;
		}
		bp->b_dirtyoff = bp->b_dirtyend = 0;
	    }
	}
	bp->b_resid = uiop->uio_resid;
	biodone(bp);
	return (error);
}
850