nfs_bio.c (revision 59249) -> nfs_bio.c (revision 60041)
1/*
2 * Copyright (c) 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Rick Macklem at The University of Guelph.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
37 * $FreeBSD: head/sys/nfsclient/nfs_bio.c 59249 2000-04-15 05:54:02Z phk $
37 * $FreeBSD: head/sys/nfsclient/nfs_bio.c 60041 2000-05-05 09:59:14Z phk $
38 */
39
40
41#include <sys/param.h>
42#include <sys/systm.h>
43#include <sys/resourcevar.h>
44#include <sys/signalvar.h>
45#include <sys/proc.h>
46#include <sys/bio.h>
46#include <sys/buf.h>
47#include <sys/vnode.h>
48#include <sys/mount.h>
49#include <sys/kernel.h>
50
51#include <vm/vm.h>
52#include <vm/vm_extern.h>
53#include <vm/vm_page.h>
54#include <vm/vm_object.h>
55#include <vm/vm_pager.h>
56#include <vm/vnode_pager.h>
57
58#include <nfs/rpcv2.h>
59#include <nfs/nfsproto.h>
60#include <nfs/nfs.h>
61#include <nfs/nfsmount.h>
62#include <nfs/nqnfs.h>
63#include <nfs/nfsnode.h>
64
65static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
66 struct proc *p));
67
68extern int nfs_numasync;
69extern int nfs_pbuf_freecnt;
70extern struct nfsstats nfsstats;
71
72/*
73 * Vnode op for VM getpages.
74 */
75int
76nfs_getpages(ap)
77 struct vop_getpages_args /* {
78 struct vnode *a_vp;
79 vm_page_t *a_m;
80 int a_count;
81 int a_reqpage;
82 vm_ooffset_t a_offset;
83 } */ *ap;
84{
85 int i, error, nextoff, size, toff, count, npages;
86 struct uio uio;
87 struct iovec iov;
88 vm_offset_t kva;
89 struct buf *bp;
90 struct vnode *vp;
91 struct proc *p;
92 struct ucred *cred;
93 struct nfsmount *nmp;
94 vm_page_t *pages;
95
96 vp = ap->a_vp;
97 p = curproc; /* XXX */
98 cred = curproc->p_ucred; /* XXX */
99 nmp = VFSTONFS(vp->v_mount);
100 pages = ap->a_m;
101 count = ap->a_count;
102
103 if (vp->v_object == NULL) {
104 printf("nfs_getpages: called with non-merged cache vnode??\n");
105 return VM_PAGER_ERROR;
106 }
107
108 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
109 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
110 (void)nfs_fsinfo(nmp, vp, cred, p);
111
112 npages = btoc(count);
113
114 /*
115 * If the requested page is partially valid, just return it and
116 * allow the pager to zero-out the blanks. Partially valid pages
117 * can only occur at the file EOF.
118 */
119
120 {
121 vm_page_t m = pages[ap->a_reqpage];
122
123 if (m->valid != 0) {
124 /* handled by vm_fault now */
125 /* vm_page_zero_invalid(m, TRUE); */
126 for (i = 0; i < npages; ++i) {
127 if (i != ap->a_reqpage)
128 vnode_pager_freepage(pages[i]);
129 }
130 return(0);
131 }
132 }
133
134 /*
135 * We use only the kva address for the buffer, but this is extremely
 136	 * convenient and fast.
137 */
138 bp = getpbuf(&nfs_pbuf_freecnt);
139
140 kva = (vm_offset_t) bp->b_data;
141 pmap_qenter(kva, pages, npages);
142
143 iov.iov_base = (caddr_t) kva;
144 iov.iov_len = count;
145 uio.uio_iov = &iov;
146 uio.uio_iovcnt = 1;
147 uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
148 uio.uio_resid = count;
149 uio.uio_segflg = UIO_SYSSPACE;
150 uio.uio_rw = UIO_READ;
151 uio.uio_procp = p;
152
153 error = nfs_readrpc(vp, &uio, cred);
154 pmap_qremove(kva, npages);
155
156 relpbuf(bp, &nfs_pbuf_freecnt);
157
158 if (error && (uio.uio_resid == count)) {
159 printf("nfs_getpages: error %d\n", error);
160 for (i = 0; i < npages; ++i) {
161 if (i != ap->a_reqpage)
162 vnode_pager_freepage(pages[i]);
163 }
164 return VM_PAGER_ERROR;
165 }
166
167 /*
168 * Calculate the number of bytes read and validate only that number
169 * of bytes. Note that due to pending writes, size may be 0. This
170 * does not mean that the remaining data is invalid!
171 */
172
173 size = count - uio.uio_resid;
174
175 for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
176 vm_page_t m;
177 nextoff = toff + PAGE_SIZE;
178 m = pages[i];
179
180 m->flags &= ~PG_ZERO;
181
182 if (nextoff <= size) {
183 /*
184 * Read operation filled an entire page
185 */
186 m->valid = VM_PAGE_BITS_ALL;
187 vm_page_undirty(m);
188 } else if (size > toff) {
189 /*
190 * Read operation filled a partial page.
191 */
192 m->valid = 0;
193 vm_page_set_validclean(m, 0, size - toff);
194 /* handled by vm_fault now */
195 /* vm_page_zero_invalid(m, TRUE); */
196 }
197
198 if (i != ap->a_reqpage) {
199 /*
200 * Whether or not to leave the page activated is up in
201 * the air, but we should put the page on a page queue
202 * somewhere (it already is in the object). Result:
 203	 * It appears that empirical results show that
204 * deactivating pages is best.
205 */
206
207 /*
208 * Just in case someone was asking for this page we
209 * now tell them that it is ok to use.
210 */
211 if (!error) {
212 if (m->flags & PG_WANTED)
213 vm_page_activate(m);
214 else
215 vm_page_deactivate(m);
216 vm_page_wakeup(m);
217 } else {
218 vnode_pager_freepage(m);
219 }
220 }
221 }
222 return 0;
223}
224
225/*
226 * Vnode op for VM putpages.
227 */
228int
229nfs_putpages(ap)
230 struct vop_putpages_args /* {
231 struct vnode *a_vp;
232 vm_page_t *a_m;
233 int a_count;
234 int a_sync;
235 int *a_rtvals;
236 vm_ooffset_t a_offset;
237 } */ *ap;
238{
239 struct uio uio;
240 struct iovec iov;
241 vm_offset_t kva;
242 struct buf *bp;
243 int iomode, must_commit, i, error, npages, count;
244 off_t offset;
245 int *rtvals;
246 struct vnode *vp;
247 struct proc *p;
248 struct ucred *cred;
249 struct nfsmount *nmp;
250 struct nfsnode *np;
251 vm_page_t *pages;
252
253 vp = ap->a_vp;
254 np = VTONFS(vp);
255 p = curproc; /* XXX */
256 cred = curproc->p_ucred; /* XXX */
257 nmp = VFSTONFS(vp->v_mount);
258 pages = ap->a_m;
259 count = ap->a_count;
260 rtvals = ap->a_rtvals;
261 npages = btoc(count);
262 offset = IDX_TO_OFF(pages[0]->pindex);
263
264 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
265 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
266 (void)nfs_fsinfo(nmp, vp, cred, p);
267
268 for (i = 0; i < npages; i++) {
269 rtvals[i] = VM_PAGER_AGAIN;
270 }
271
272 /*
273 * When putting pages, do not extend file past EOF.
274 */
275
276 if (offset + count > np->n_size) {
277 count = np->n_size - offset;
278 if (count < 0)
279 count = 0;
280 }
281
282 /*
283 * We use only the kva address for the buffer, but this is extremely
 284	 * convenient and fast.
285 */
286 bp = getpbuf(&nfs_pbuf_freecnt);
287
288 kva = (vm_offset_t) bp->b_data;
289 pmap_qenter(kva, pages, npages);
290
291 iov.iov_base = (caddr_t) kva;
292 iov.iov_len = count;
293 uio.uio_iov = &iov;
294 uio.uio_iovcnt = 1;
295 uio.uio_offset = offset;
296 uio.uio_resid = count;
297 uio.uio_segflg = UIO_SYSSPACE;
298 uio.uio_rw = UIO_WRITE;
299 uio.uio_procp = p;
300
301 if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
302 iomode = NFSV3WRITE_UNSTABLE;
303 else
304 iomode = NFSV3WRITE_FILESYNC;
305
306 error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit);
307
308 pmap_qremove(kva, npages);
309 relpbuf(bp, &nfs_pbuf_freecnt);
310
311 if (!error) {
312 int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
313 for (i = 0; i < nwritten; i++) {
314 rtvals[i] = VM_PAGER_OK;
315 vm_page_undirty(pages[i]);
316 }
317 if (must_commit)
318 nfs_clearcommit(vp->v_mount);
319 }
320 return rtvals[0];
321}
322
323/*
324 * Vnode op for read using bio
325 */
326int
327nfs_bioread(vp, uio, ioflag, cred)
328 register struct vnode *vp;
329 register struct uio *uio;
330 int ioflag;
331 struct ucred *cred;
332{
333 register struct nfsnode *np = VTONFS(vp);
334 register int biosize, i;
335 struct buf *bp = 0, *rabp;
336 struct vattr vattr;
337 struct proc *p;
338 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
339 daddr_t lbn, rabn;
340 int bcount;
341 int seqcount;
342 int nra, error = 0, n = 0, on = 0;
343
344#ifdef DIAGNOSTIC
345 if (uio->uio_rw != UIO_READ)
346 panic("nfs_read mode");
347#endif
348 if (uio->uio_resid == 0)
349 return (0);
350 if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */
351 return (EINVAL);
352 p = uio->uio_procp;
353
354 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
355 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
356 (void)nfs_fsinfo(nmp, vp, cred, p);
357 if (vp->v_type != VDIR &&
358 (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
359 return (EFBIG);
360 biosize = vp->v_mount->mnt_stat.f_iosize;
361 seqcount = (int)((off_t)(ioflag >> 16) * biosize / BKVASIZE);
362 /*
363 * For nfs, cache consistency can only be maintained approximately.
364 * Although RFC1094 does not specify the criteria, the following is
365 * believed to be compatible with the reference port.
366 * For nqnfs, full cache consistency is maintained within the loop.
367 * For nfs:
368 * If the file's modify time on the server has changed since the
369 * last read rpc or you have written to the file,
370 * you may have lost data cache consistency with the
371 * server, so flush all of the file's data out of the cache.
372 * Then force a getattr rpc to ensure that you have up to date
373 * attributes.
374 * NB: This implies that cache data can be read when up to
375 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
376 * attributes this could be forced by setting n_attrstamp to 0 before
377 * the VOP_GETATTR() call.
378 */
379 if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
380 if (np->n_flag & NMODIFIED) {
381 if (vp->v_type != VREG) {
382 if (vp->v_type != VDIR)
383 panic("nfs: bioread, not dir");
384 nfs_invaldir(vp);
385 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
386 if (error)
387 return (error);
388 }
389 np->n_attrstamp = 0;
390 error = VOP_GETATTR(vp, &vattr, cred, p);
391 if (error)
392 return (error);
393 np->n_mtime = vattr.va_mtime.tv_sec;
394 } else {
395 error = VOP_GETATTR(vp, &vattr, cred, p);
396 if (error)
397 return (error);
398 if (np->n_mtime != vattr.va_mtime.tv_sec) {
399 if (vp->v_type == VDIR)
400 nfs_invaldir(vp);
401 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
402 if (error)
403 return (error);
404 np->n_mtime = vattr.va_mtime.tv_sec;
405 }
406 }
407 }
408 do {
409
410 /*
411 * Get a valid lease. If cached data is stale, flush it.
412 */
413 if (nmp->nm_flag & NFSMNT_NQNFS) {
414 if (NQNFS_CKINVALID(vp, np, ND_READ)) {
415 do {
416 error = nqnfs_getlease(vp, ND_READ, cred, p);
417 } while (error == NQNFS_EXPIRED);
418 if (error)
419 return (error);
420 if (np->n_lrev != np->n_brev ||
421 (np->n_flag & NQNFSNONCACHE) ||
422 ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
423 if (vp->v_type == VDIR)
424 nfs_invaldir(vp);
425 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
426 if (error)
427 return (error);
428 np->n_brev = np->n_lrev;
429 }
430 } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
431 nfs_invaldir(vp);
432 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
433 if (error)
434 return (error);
435 }
436 }
437 if (np->n_flag & NQNFSNONCACHE) {
438 switch (vp->v_type) {
439 case VREG:
440 return (nfs_readrpc(vp, uio, cred));
441 case VLNK:
442 return (nfs_readlinkrpc(vp, uio, cred));
443 case VDIR:
444 break;
445 default:
446 printf(" NQNFSNONCACHE: type %x unexpected\n",
447 vp->v_type);
448 };
449 }
450 switch (vp->v_type) {
451 case VREG:
452 nfsstats.biocache_reads++;
453 lbn = uio->uio_offset / biosize;
454 on = uio->uio_offset & (biosize - 1);
455
456 /*
457 * Start the read ahead(s), as required.
458 */
459 if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
460 for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
461 (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
462 rabn = lbn + 1 + nra;
463 if (!incore(vp, rabn)) {
464 rabp = nfs_getcacheblk(vp, rabn, biosize, p);
465 if (!rabp)
466 return (EINTR);
467 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
468 rabp->b_flags |= B_ASYNC;
469 rabp->b_iocmd = BIO_READ;
470 vfs_busy_pages(rabp, 0);
471 if (nfs_asyncio(rabp, cred, p)) {
472 rabp->b_flags |= B_INVAL;
473 rabp->b_ioflags |= BIO_ERROR;
474 vfs_unbusy_pages(rabp);
475 brelse(rabp);
476 break;
477 }
478 } else {
479 brelse(rabp);
480 }
481 }
482 }
483 }
484
485 /*
486 * Obtain the buffer cache block. Figure out the buffer size
487 * when we are at EOF. If we are modifying the size of the
488 * buffer based on an EOF condition we need to hold
489 * nfs_rslock() through obtaining the buffer to prevent
490 * a potential writer-appender from messing with n_size.
 491	 * Otherwise we may accidentally truncate the buffer and
492 * lose dirty data.
493 *
494 * Note that bcount is *not* DEV_BSIZE aligned.
495 */
496
497again:
498 bcount = biosize;
499 if ((off_t)lbn * biosize >= np->n_size) {
500 bcount = 0;
501 } else if ((off_t)(lbn + 1) * biosize > np->n_size) {
502 bcount = np->n_size - (off_t)lbn * biosize;
503 }
504 if (bcount != biosize) {
505 switch(nfs_rslock(np, p)) {
506 case ENOLCK:
507 goto again;
508 /* not reached */
509 case EINTR:
510 case ERESTART:
511 return(EINTR);
512 /* not reached */
513 default:
514 break;
515 }
516 }
517
518 bp = nfs_getcacheblk(vp, lbn, bcount, p);
519
520 if (bcount != biosize)
521 nfs_rsunlock(np, p);
522 if (!bp)
523 return (EINTR);
524
525 /*
526 * If B_CACHE is not set, we must issue the read. If this
527 * fails, we return an error.
528 */
529
530 if ((bp->b_flags & B_CACHE) == 0) {
531 bp->b_iocmd = BIO_READ;
532 vfs_busy_pages(bp, 0);
533 error = nfs_doio(bp, cred, p);
534 if (error) {
535 brelse(bp);
536 return (error);
537 }
538 }
539
540 /*
541 * on is the offset into the current bp. Figure out how many
542 * bytes we can copy out of the bp. Note that bcount is
543 * NOT DEV_BSIZE aligned.
544 *
545 * Then figure out how many bytes we can copy into the uio.
546 */
547
548 n = 0;
549 if (on < bcount)
550 n = min((unsigned)(bcount - on), uio->uio_resid);
551 break;
552 case VLNK:
553 nfsstats.biocache_readlinks++;
554 bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
555 if (!bp)
556 return (EINTR);
557 if ((bp->b_flags & B_CACHE) == 0) {
558 bp->b_iocmd = BIO_READ;
559 vfs_busy_pages(bp, 0);
560 error = nfs_doio(bp, cred, p);
561 if (error) {
562 bp->b_ioflags |= BIO_ERROR;
563 brelse(bp);
564 return (error);
565 }
566 }
567 n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
568 on = 0;
569 break;
570 case VDIR:
571 nfsstats.biocache_readdirs++;
572 if (np->n_direofoffset
573 && uio->uio_offset >= np->n_direofoffset) {
574 return (0);
575 }
576 lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
577 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
578 bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
579 if (!bp)
580 return (EINTR);
581 if ((bp->b_flags & B_CACHE) == 0) {
582 bp->b_iocmd = BIO_READ;
583 vfs_busy_pages(bp, 0);
584 error = nfs_doio(bp, cred, p);
585 if (error) {
586 brelse(bp);
587 }
588 while (error == NFSERR_BAD_COOKIE) {
589 printf("got bad cookie vp %p bp %p\n", vp, bp);
590 nfs_invaldir(vp);
591 error = nfs_vinvalbuf(vp, 0, cred, p, 1);
592 /*
593 * Yuck! The directory has been modified on the
594 * server. The only way to get the block is by
595 * reading from the beginning to get all the
596 * offset cookies.
597 *
598 * Leave the last bp intact unless there is an error.
599 * Loop back up to the while if the error is another
 600	 * NFSERR_BAD_COOKIE (double yuck!).
601 */
602 for (i = 0; i <= lbn && !error; i++) {
603 if (np->n_direofoffset
604 && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
605 return (0);
606 bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
607 if (!bp)
608 return (EINTR);
609 if ((bp->b_flags & B_CACHE) == 0) {
610 bp->b_iocmd = BIO_READ;
611 vfs_busy_pages(bp, 0);
612 error = nfs_doio(bp, cred, p);
613 /*
614 * no error + B_INVAL == directory EOF,
615 * use the block.
616 */
617 if (error == 0 && (bp->b_flags & B_INVAL))
618 break;
619 }
620 /*
621 * An error will throw away the block and the
622 * for loop will break out. If no error and this
623 * is not the block we want, we throw away the
624 * block and go for the next one via the for loop.
625 */
626 if (error || i < lbn)
627 brelse(bp);
628 }
629 }
630 /*
631 * The above while is repeated if we hit another cookie
632 * error. If we hit an error and it wasn't a cookie error,
633 * we give up.
634 */
635 if (error)
636 return (error);
637 }
638
639 /*
640 * If not eof and read aheads are enabled, start one.
641 * (You need the current block first, so that you have the
642 * directory offset cookie of the next block.)
643 */
644 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
645 (bp->b_flags & B_INVAL) == 0 &&
646 (np->n_direofoffset == 0 ||
647 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
648 !(np->n_flag & NQNFSNONCACHE) &&
649 !incore(vp, lbn + 1)) {
650 rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
651 if (rabp) {
652 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
653 rabp->b_flags |= B_ASYNC;
654 rabp->b_iocmd = BIO_READ;
655 vfs_busy_pages(rabp, 0);
656 if (nfs_asyncio(rabp, cred, p)) {
657 rabp->b_flags |= B_INVAL;
658 rabp->b_ioflags |= BIO_ERROR;
659 vfs_unbusy_pages(rabp);
660 brelse(rabp);
661 }
662 } else {
663 brelse(rabp);
664 }
665 }
666 }
667 /*
 668	 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
669 * chopped for the EOF condition, we cannot tell how large
670 * NFS directories are going to be until we hit EOF. So
671 * an NFS directory buffer is *not* chopped to its EOF. Now,
672 * it just so happens that b_resid will effectively chop it
673 * to EOF. *BUT* this information is lost if the buffer goes
674 * away and is reconstituted into a B_CACHE state ( due to
675 * being VMIO ) later. So we keep track of the directory eof
676 * in np->n_direofoffset and chop it off as an extra step
677 * right here.
678 */
679 n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
680 if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
681 n = np->n_direofoffset - uio->uio_offset;
682 break;
683 default:
684 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
685 break;
686 };
687
688 if (n > 0) {
689 error = uiomove(bp->b_data + on, (int)n, uio);
690 }
691 switch (vp->v_type) {
692 case VREG:
693 break;
694 case VLNK:
695 n = 0;
696 break;
697 case VDIR:
698 /*
699 * Invalidate buffer if caching is disabled, forcing a
700 * re-read from the remote later.
701 */
702 if (np->n_flag & NQNFSNONCACHE)
703 bp->b_flags |= B_INVAL;
704 break;
705 default:
706 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
707 }
708 brelse(bp);
709 } while (error == 0 && uio->uio_resid > 0 && n > 0);
710 return (error);
711}
712
713/*
714 * Vnode op for write using bio
715 */
716int
717nfs_write(ap)
718 struct vop_write_args /* {
719 struct vnode *a_vp;
720 struct uio *a_uio;
721 int a_ioflag;
722 struct ucred *a_cred;
723 } */ *ap;
724{
725 int biosize;
726 struct uio *uio = ap->a_uio;
727 struct proc *p = uio->uio_procp;
728 struct vnode *vp = ap->a_vp;
729 struct nfsnode *np = VTONFS(vp);
730 struct ucred *cred = ap->a_cred;
731 int ioflag = ap->a_ioflag;
732 struct buf *bp;
733 struct vattr vattr;
734 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
735 daddr_t lbn;
736 int bcount;
737 int n, on, error = 0, iomode, must_commit;
738 int haverslock = 0;
739
740#ifdef DIAGNOSTIC
741 if (uio->uio_rw != UIO_WRITE)
742 panic("nfs_write mode");
743 if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
744 panic("nfs_write proc");
745#endif
746 if (vp->v_type != VREG)
747 return (EIO);
748 if (np->n_flag & NWRITEERR) {
749 np->n_flag &= ~NWRITEERR;
750 return (np->n_error);
751 }
752 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
753 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
754 (void)nfs_fsinfo(nmp, vp, cred, p);
755
756 /*
757 * Synchronously flush pending buffers if we are in synchronous
758 * mode or if we are appending.
759 */
760 if (ioflag & (IO_APPEND | IO_SYNC)) {
761 if (np->n_flag & NMODIFIED) {
762 np->n_attrstamp = 0;
763 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
764 if (error)
765 return (error);
766 }
767 }
768
769 /*
770 * If IO_APPEND then load uio_offset. We restart here if we cannot
771 * get the append lock.
772 */
773restart:
774 if (ioflag & IO_APPEND) {
775 np->n_attrstamp = 0;
776 error = VOP_GETATTR(vp, &vattr, cred, p);
777 if (error)
778 return (error);
779 uio->uio_offset = np->n_size;
780 }
781
782 if (uio->uio_offset < 0)
783 return (EINVAL);
784 if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
785 return (EFBIG);
786 if (uio->uio_resid == 0)
787 return (0);
788
789 /*
790 * We need to obtain the rslock if we intend to modify np->n_size
 791	 * in order to guarantee the append point with multiple contending
 792	 * writers, to guarantee that no other appenders modify n_size
 793	 * while we are trying to obtain a truncated buffer (i.e. to avoid
 794	 * accidentally truncating data written by another appender due to
795 * the race), and to ensure that the buffer is populated prior to
796 * our extending of the file. We hold rslock through the entire
797 * operation.
798 *
799 * Note that we do not synchronize the case where someone truncates
800 * the file while we are appending to it because attempting to lock
801 * this case may deadlock other parts of the system unexpectedly.
802 */
803 if ((ioflag & IO_APPEND) ||
804 uio->uio_offset + uio->uio_resid > np->n_size) {
805 switch(nfs_rslock(np, p)) {
806 case ENOLCK:
807 goto restart;
808 /* not reached */
809 case EINTR:
810 case ERESTART:
811 return(EINTR);
812 /* not reached */
813 default:
814 break;
815 }
816 haverslock = 1;
817 }
818
819 /*
820 * Maybe this should be above the vnode op call, but so long as
 822	 * file servers have no limits, I don't think it matters
822 */
823 if (p && uio->uio_offset + uio->uio_resid >
824 p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
825 psignal(p, SIGXFSZ);
826 if (haverslock)
827 nfs_rsunlock(np, p);
828 return (EFBIG);
829 }
830
831 biosize = vp->v_mount->mnt_stat.f_iosize;
832
833 do {
834 /*
835 * Check for a valid write lease.
836 */
837 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
838 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
839 do {
840 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
841 } while (error == NQNFS_EXPIRED);
842 if (error)
843 break;
844 if (np->n_lrev != np->n_brev ||
845 (np->n_flag & NQNFSNONCACHE)) {
846 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
847 if (error)
848 break;
849 np->n_brev = np->n_lrev;
850 }
851 }
852 if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
853 iomode = NFSV3WRITE_FILESYNC;
854 error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
855 if (must_commit)
856 nfs_clearcommit(vp->v_mount);
857 break;
858 }
859 nfsstats.biocache_writes++;
860 lbn = uio->uio_offset / biosize;
861 on = uio->uio_offset & (biosize-1);
862 n = min((unsigned)(biosize - on), uio->uio_resid);
863again:
864 /*
865 * Handle direct append and file extension cases, calculate
866 * unaligned buffer size.
867 */
868
869 if (uio->uio_offset == np->n_size && n) {
870 /*
871 * Get the buffer (in its pre-append state to maintain
872 * B_CACHE if it was previously set). Resize the
873 * nfsnode after we have locked the buffer to prevent
874 * readers from reading garbage.
875 */
876 bcount = on;
877 bp = nfs_getcacheblk(vp, lbn, bcount, p);
878
879 if (bp != NULL) {
880 long save;
881
882 np->n_size = uio->uio_offset + n;
883 np->n_flag |= NMODIFIED;
884 vnode_pager_setsize(vp, np->n_size);
885
886 save = bp->b_flags & B_CACHE;
887 bcount += n;
888 allocbuf(bp, bcount);
889 bp->b_flags |= save;
890 }
891 } else {
892 /*
893 * Obtain the locked cache block first, and then
894 * adjust the file's size as appropriate.
895 */
896 bcount = on + n;
897 if ((off_t)lbn * biosize + bcount < np->n_size) {
898 if ((off_t)(lbn + 1) * biosize < np->n_size)
899 bcount = biosize;
900 else
901 bcount = np->n_size - (off_t)lbn * biosize;
902 }
903
904 bp = nfs_getcacheblk(vp, lbn, bcount, p);
905
906 if (uio->uio_offset + n > np->n_size) {
907 np->n_size = uio->uio_offset + n;
908 np->n_flag |= NMODIFIED;
909 vnode_pager_setsize(vp, np->n_size);
910 }
911 }
912
913 if (!bp) {
914 error = EINTR;
915 break;
916 }
917
918 /*
919 * Issue a READ if B_CACHE is not set. In special-append
920 * mode, B_CACHE is based on the buffer prior to the write
921 * op and is typically set, avoiding the read. If a read
922 * is required in special append mode, the server will
923 * probably send us a short-read since we extended the file
 924	 * on our end, resulting in b_resid == 0 and, thus,
925 * B_CACHE getting set.
926 *
927 * We can also avoid issuing the read if the write covers
928 * the entire buffer. We have to make sure the buffer state
929 * is reasonable in this case since we will not be initiating
930 * I/O. See the comments in kern/vfs_bio.c's getblk() for
931 * more information.
932 *
933 * B_CACHE may also be set due to the buffer being cached
934 * normally.
935 */
936
937 if (on == 0 && n == bcount) {
938 bp->b_flags |= B_CACHE;
939 bp->b_flags &= ~B_INVAL;
940 bp->b_ioflags &= ~BIO_ERROR;
941 }
942
943 if ((bp->b_flags & B_CACHE) == 0) {
944 bp->b_iocmd = BIO_READ;
945 vfs_busy_pages(bp, 0);
946 error = nfs_doio(bp, cred, p);
947 if (error) {
948 brelse(bp);
949 break;
950 }
951 }
952 if (!bp) {
953 error = EINTR;
954 break;
955 }
956 if (bp->b_wcred == NOCRED) {
957 crhold(cred);
958 bp->b_wcred = cred;
959 }
960 np->n_flag |= NMODIFIED;
961
962 /*
963 * If dirtyend exceeds file size, chop it down. This should
964 * not normally occur but there is an append race where it
965 * might occur XXX, so we log it.
966 *
967 * If the chopping creates a reverse-indexed or degenerate
968 * situation with dirtyoff/end, we 0 both of them.
969 */
970
971 if (bp->b_dirtyend > bcount) {
972 printf("NFS append race @%lx:%d\n",
973 (long)bp->b_blkno * DEV_BSIZE,
974 bp->b_dirtyend - bcount);
975 bp->b_dirtyend = bcount;
976 }
977
978 if (bp->b_dirtyoff >= bp->b_dirtyend)
979 bp->b_dirtyoff = bp->b_dirtyend = 0;
980
981 /*
982 * If the new write will leave a contiguous dirty
983 * area, just update the b_dirtyoff and b_dirtyend,
984 * otherwise force a write rpc of the old dirty area.
985 *
986 * While it is possible to merge discontiguous writes due to
987 * our having a B_CACHE buffer ( and thus valid read data
988 * for the hole), we don't because it could lead to
989 * significant cache coherency problems with multiple clients,
990 * especially if locking is implemented later on.
991 *
992 * as an optimization we could theoretically maintain
993 * a linked list of discontinuous areas, but we would still
994 * have to commit them separately so there isn't much
995 * advantage to it except perhaps a bit of asynchronization.
996 */
997
998 if (bp->b_dirtyend > 0 &&
999 (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
1000 if (BUF_WRITE(bp) == EINTR)
1001 return (EINTR);
1002 goto again;
1003 }
1004
1005 /*
1006 * Check for valid write lease and get one as required.
1007 * In case getblk() and/or bwrite() delayed us.
1008 */
1009 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
1010 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
1011 do {
1012 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
1013 } while (error == NQNFS_EXPIRED);
1014 if (error) {
1015 brelse(bp);
1016 break;
1017 }
1018 if (np->n_lrev != np->n_brev ||
1019 (np->n_flag & NQNFSNONCACHE)) {
1020 brelse(bp);
1021 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1022 if (error)
1023 break;
1024 np->n_brev = np->n_lrev;
1025 goto again;
1026 }
1027 }
1028
1029 error = uiomove((char *)bp->b_data + on, n, uio);
1030
1031 /*
1032 * Since this block is being modified, it must be written
1033 * again and not just committed. Since write clustering does
1034 * not work for the stage 1 data write, only the stage 2
1035 * commit rpc, we have to clear B_CLUSTEROK as well.
1036 */
1037 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1038
1039 if (error) {
1040 bp->b_ioflags |= BIO_ERROR;
1041 brelse(bp);
1042 break;
1043 }
1044
1045 /*
1046 * Only update dirtyoff/dirtyend if not a degenerate
1047 * condition.
1048 */
1049 if (n) {
1050 if (bp->b_dirtyend > 0) {
1051 bp->b_dirtyoff = min(on, bp->b_dirtyoff);
1052 bp->b_dirtyend = max((on + n), bp->b_dirtyend);
1053 } else {
1054 bp->b_dirtyoff = on;
1055 bp->b_dirtyend = on + n;
1056 }
1057 vfs_bio_set_validclean(bp, on, n);
1058 }
1059
1060 /*
1061 * If the lease is non-cachable or IO_SYNC do bwrite().
1062 *
1063 * IO_INVAL appears to be unused. The idea appears to be
1064 * to turn off caching in this case. Very odd. XXX
1065 */
1066 if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
1067 if (ioflag & IO_INVAL)
1068 bp->b_flags |= B_NOCACHE;
1069 error = BUF_WRITE(bp);
1070 if (error)
1071 break;
1072 if (np->n_flag & NQNFSNONCACHE) {
1073 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1074 if (error)
1075 break;
1076 }
1077 } else if ((n + on) == biosize &&
1078 (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
1079 bp->b_flags |= B_ASYNC;
1080 (void)nfs_writebp(bp, 0, 0);
1081 } else {
1082 bdwrite(bp);
1083 }
1084 } while (uio->uio_resid > 0 && n > 0);
1085
1086 if (haverslock)
1087 nfs_rsunlock(np, p);
1088
1089 return (error);
1090}
1091
1092/*
1093 * Get an nfs cache block.
1094 *
1095 * Allocate a new one if the block isn't currently in the cache
1096 * and return the block marked busy. If the calling process is
1097 * interrupted by a signal for an interruptible mount point, return
1098 * NULL.
1099 *
1100 * The caller must carefully deal with the possible B_INVAL state of
1101 * the buffer. nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
1102 * indirectly), so synchronous reads can be issued without worrying about
1103 * the B_INVAL state. We have to be a little more careful when dealing
1104 * with writes (see comments in nfs_write()) when extending a file past
1105 * its EOF.
1106 */
1107static struct buf *
1108nfs_getcacheblk(vp, bn, size, p)
1109 struct vnode *vp;
1110 daddr_t bn;
1111 int size;
1112 struct proc *p;
1113{
1114 register struct buf *bp;
1115 struct mount *mp;
1116 struct nfsmount *nmp;
1117
1118 mp = vp->v_mount;
1119 nmp = VFSTONFS(mp);
1120
1121 if (nmp->nm_flag & NFSMNT_INT) {
1122 bp = getblk(vp, bn, size, PCATCH, 0);
1123 while (bp == (struct buf *)0) {
1124 if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
1125 return ((struct buf *)0);
1126 bp = getblk(vp, bn, size, 0, 2 * hz);
1127 }
1128 } else {
1129 bp = getblk(vp, bn, size, 0, 0);
1130 }
1131
1132 if (vp->v_type == VREG) {
1133 int biosize;
1134
1135 biosize = mp->mnt_stat.f_iosize;
1136 bp->b_blkno = bn * (biosize / DEV_BSIZE);
1137 }
1138 return (bp);
1139}
1140
1141/*
1142 * Flush and invalidate all dirty buffers. If another process is already
1143 * doing the flush, just wait for completion.
1144 */
1145int
1146nfs_vinvalbuf(vp, flags, cred, p, intrflg)
1147 struct vnode *vp;
1148 int flags;
1149 struct ucred *cred;
1150 struct proc *p;
1151 int intrflg;
1152{
1153 register struct nfsnode *np = VTONFS(vp);
1154 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1155 int error = 0, slpflag, slptimeo;
1156
1157 if (vp->v_flag & VXLOCK) {
1158 return (0);
1159 }
1160
1161 if ((nmp->nm_flag & NFSMNT_INT) == 0)
1162 intrflg = 0;
1163 if (intrflg) {
1164 slpflag = PCATCH;
1165 slptimeo = 2 * hz;
1166 } else {
1167 slpflag = 0;
1168 slptimeo = 0;
1169 }
1170 /*
1171 * First wait for any other process doing a flush to complete.
1172 */
1173 while (np->n_flag & NFLUSHINPROG) {
1174 np->n_flag |= NFLUSHWANT;
1175 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
1176 slptimeo);
1177 if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
1178 return (EINTR);
1179 }
1180
1181 /*
1182 * Now, flush as required.
1183 */
1184 np->n_flag |= NFLUSHINPROG;
1185 error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
1186 while (error) {
1187 if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
1188 np->n_flag &= ~NFLUSHINPROG;
1189 if (np->n_flag & NFLUSHWANT) {
1190 np->n_flag &= ~NFLUSHWANT;
1191 wakeup((caddr_t)&np->n_flag);
1192 }
1193 return (EINTR);
1194 }
1195 error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
1196 }
1197 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
1198 if (np->n_flag & NFLUSHWANT) {
1199 np->n_flag &= ~NFLUSHWANT;
1200 wakeup((caddr_t)&np->n_flag);
1201 }
1202 return (0);
1203}
1204
1205/*
1206 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
1207 * This is mainly to avoid queueing async I/O requests when the nfsiods
1208 * are all hung on a dead server.
1209 *
1210 * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
1211 * is eventually dequeued by the async daemon, nfs_doio() *will*.
1212 */
1213int
1214nfs_asyncio(bp, cred, procp)
1215 register struct buf *bp;
1216 struct ucred *cred;
1217 struct proc *procp;
1218{
1219 struct nfsmount *nmp;
1220 int i;
1221 int gotiod;
1222 int slpflag = 0;
1223 int slptimeo = 0;
1224 int error;
1225
1226 /*
1227 * If no async daemons then return EIO to force caller to run the rpc
1228 * synchronously.
1229 */
1230 if (nfs_numasync == 0)
1231 return (EIO);
1232
1233 nmp = VFSTONFS(bp->b_vp->v_mount);
1234
1235 /*
 1236	 * Commits are usually short and sweet so let's save some CPU and
1237 * leave the async daemons for more important rpc's (such as reads
1238 * and writes).
1239 */
1240 if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
1241 (nmp->nm_bufqiods > nfs_numasync / 2)) {
1242 return(EIO);
1243 }
1244
1245again:
1246 if (nmp->nm_flag & NFSMNT_INT)
1247 slpflag = PCATCH;
1248 gotiod = FALSE;
1249
1250 /*
1251 * Find a free iod to process this request.
1252 */
1253 for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
1254 if (nfs_iodwant[i]) {
1255 /*
1256 * Found one, so wake it up and tell it which
1257 * mount to process.
1258 */
1259 NFS_DPF(ASYNCIO,
1260 ("nfs_asyncio: waking iod %d for mount %p\n",
1261 i, nmp));
1262 nfs_iodwant[i] = (struct proc *)0;
1263 nfs_iodmount[i] = nmp;
1264 nmp->nm_bufqiods++;
1265 wakeup((caddr_t)&nfs_iodwant[i]);
1266 gotiod = TRUE;
1267 break;
1268 }
1269
1270 /*
1271 * If none are free, we may already have an iod working on this mount
1272 * point. If so, it will process our request.
1273 */
1274 if (!gotiod) {
1275 if (nmp->nm_bufqiods > 0) {
1276 NFS_DPF(ASYNCIO,
1277 ("nfs_asyncio: %d iods are already processing mount %p\n",
1278 nmp->nm_bufqiods, nmp));
1279 gotiod = TRUE;
1280 }
1281 }
1282
1283 /*
1284 * If we have an iod which can process the request, then queue
1285 * the buffer.
1286 */
1287 if (gotiod) {
1288 /*
1289 * Ensure that the queue never grows too large. We still want
 1290	 * to asynchronize so we block rather than return EIO.
1291 */
1292 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
1293 NFS_DPF(ASYNCIO,
1294 ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
1295 nmp->nm_bufqwant = TRUE;
1296 error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
1297 "nfsaio", slptimeo);
1298 if (error) {
1299 if (nfs_sigintr(nmp, NULL, procp))
1300 return (EINTR);
1301 if (slpflag == PCATCH) {
1302 slpflag = 0;
1303 slptimeo = 2 * hz;
1304 }
1305 }
1306 /*
1307 * We might have lost our iod while sleeping,
 1308	 * so check and loop if necessary.
1309 */
1310 if (nmp->nm_bufqiods == 0) {
1311 NFS_DPF(ASYNCIO,
1312 ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
1313 goto again;
1314 }
1315 }
1316
1317 if (bp->b_iocmd == BIO_READ) {
1318 if (bp->b_rcred == NOCRED && cred != NOCRED) {
1319 crhold(cred);
1320 bp->b_rcred = cred;
1321 }
1322 } else {
1323 bp->b_flags |= B_WRITEINPROG;
1324 if (bp->b_wcred == NOCRED && cred != NOCRED) {
1325 crhold(cred);
1326 bp->b_wcred = cred;
1327 }
1328 }
1329
1330 BUF_KERNPROC(bp);
1331 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
1332 nmp->nm_bufqlen++;
1333 return (0);
1334 }
1335
1336 /*
1337 * All the iods are busy on other mounts, so return EIO to
1338 * force the caller to process the i/o synchronously.
1339 */
1340 NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
1341 return (EIO);
1342}
1343
1344/*
1345 * Do an I/O operation to/from a cache block. This may be called
1346 * synchronously or from an nfsiod.
1347 */
1348int
1349nfs_doio(bp, cr, p)
1350 struct buf *bp;
1351 struct ucred *cr;
1352 struct proc *p;
1353{
1354 struct uio *uiop;
1355 struct vnode *vp;
1356 struct nfsnode *np;
1357 struct nfsmount *nmp;
1358 int error = 0, iomode, must_commit = 0;
1359 struct uio uio;
1360 struct iovec io;
1361
1362 vp = bp->b_vp;
1363 np = VTONFS(vp);
1364 nmp = VFSTONFS(vp->v_mount);
1365 uiop = &uio;
1366 uiop->uio_iov = &io;
1367 uiop->uio_iovcnt = 1;
1368 uiop->uio_segflg = UIO_SYSSPACE;
1369 uiop->uio_procp = p;
1370
1371 /*
1372 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We
1373 * do this here so we do not have to do it in all the code that
1374 * calls us.
1375 */
1376 bp->b_flags &= ~B_INVAL;
1377 bp->b_ioflags &= ~BIO_ERROR;
1378
1379 KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
1380
1381 /*
1382 * Historically, paging was done with physio, but no more.
1383 */
1384 if (bp->b_flags & B_PHYS) {
1385 /*
1386 * ...though reading /dev/drum still gets us here.
1387 */
1388 io.iov_len = uiop->uio_resid = bp->b_bcount;
1389 /* mapping was done by vmapbuf() */
1390 io.iov_base = bp->b_data;
1391 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
1392 if (bp->b_iocmd == BIO_READ) {
1393 uiop->uio_rw = UIO_READ;
1394 nfsstats.read_physios++;
1395 error = nfs_readrpc(vp, uiop, cr);
1396 } else {
1397 int com;
1398
1399 iomode = NFSV3WRITE_DATASYNC;
1400 uiop->uio_rw = UIO_WRITE;
1401 nfsstats.write_physios++;
1402 error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
1403 }
1404 if (error) {
1405 bp->b_ioflags |= BIO_ERROR;
1406 bp->b_error = error;
1407 }
1408 } else if (bp->b_iocmd == BIO_READ) {
1409 io.iov_len = uiop->uio_resid = bp->b_bcount;
1410 io.iov_base = bp->b_data;
1411 uiop->uio_rw = UIO_READ;
1412 switch (vp->v_type) {
1413 case VREG:
1414 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
1415 nfsstats.read_bios++;
1416 error = nfs_readrpc(vp, uiop, cr);
1417 if (!error) {
1418 if (uiop->uio_resid) {
1419 /*
1420 * If we had a short read with no error, we must have
1421 * hit a file hole. We should zero-fill the remainder.
1422 * This can also occur if the server hits the file EOF.
1423 *
1424 * Holes used to be able to occur due to pending
1425 * writes, but that is not possible any longer.
1426 */
1427 int nread = bp->b_bcount - uiop->uio_resid;
1428 int left = bp->b_bcount - nread;
1429
1430 if (left > 0)
1431 bzero((char *)bp->b_data + nread, left);
1432 uiop->uio_resid = 0;
1433 }
1434 }
1435 if (p && (vp->v_flag & VTEXT) &&
1436 (((nmp->nm_flag & NFSMNT_NQNFS) &&
1437 NQNFS_CKINVALID(vp, np, ND_READ) &&
1438 np->n_lrev != np->n_brev) ||
1439 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
1440 np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
1441 uprintf("Process killed due to text file modification\n");
1442 psignal(p, SIGKILL);
1443 PHOLD(p);
1444 }
1445 break;
1446 case VLNK:
1447 uiop->uio_offset = (off_t)0;
1448 nfsstats.readlink_bios++;
1449 error = nfs_readlinkrpc(vp, uiop, cr);
1450 break;
1451 case VDIR:
1452 nfsstats.readdir_bios++;
1453 uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
1454 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
1455 error = nfs_readdirplusrpc(vp, uiop, cr);
1456 if (error == NFSERR_NOTSUPP)
1457 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
1458 }
1459 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
1460 error = nfs_readdirrpc(vp, uiop, cr);
1461 /*
1462 * end-of-directory sets B_INVAL but does not generate an
1463 * error.
1464 */
1465 if (error == 0 && uiop->uio_resid == bp->b_bcount)
1466 bp->b_flags |= B_INVAL;
1467 break;
1468 default:
1469 printf("nfs_doio: type %x unexpected\n",vp->v_type);
1470 break;
1471 };
1472 if (error) {
1473 bp->b_ioflags |= BIO_ERROR;
1474 bp->b_error = error;
1475 }
1476 } else {
1477 /*
1478 * If we only need to commit, try to commit
1479 */
1480 if (bp->b_flags & B_NEEDCOMMIT) {
1481 int retv;
1482 off_t off;
1483
1484 off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
1485 bp->b_flags |= B_WRITEINPROG;
1486 retv = nfs_commit(
1487 bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff,
1488 bp->b_wcred, p);
1489 bp->b_flags &= ~B_WRITEINPROG;
1490 if (retv == 0) {
1491 bp->b_dirtyoff = bp->b_dirtyend = 0;
1492 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1493 bp->b_resid = 0;
1494 bufdone(bp);
1495 return (0);
1496 }
1497 if (retv == NFSERR_STALEWRITEVERF) {
1498 nfs_clearcommit(bp->b_vp->v_mount);
1499 }
1500 }
1501
1502 /*
1503 * Setup for actual write
1504 */
1505
1506 if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
1507 bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
1508
1509 if (bp->b_dirtyend > bp->b_dirtyoff) {
1510 io.iov_len = uiop->uio_resid = bp->b_dirtyend
1511 - bp->b_dirtyoff;
1512 uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
1513 + bp->b_dirtyoff;
1514 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
1515 uiop->uio_rw = UIO_WRITE;
1516 nfsstats.write_bios++;
1517
1518 if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
1519 iomode = NFSV3WRITE_UNSTABLE;
1520 else
1521 iomode = NFSV3WRITE_FILESYNC;
1522
1523 bp->b_flags |= B_WRITEINPROG;
1524 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
1525
1526 /*
1527 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
1528 * to cluster the buffers needing commit. This will allow
1529 * the system to submit a single commit rpc for the whole
1530 * cluster. We can do this even if the buffer is not 100%
1531 * dirty (relative to the NFS blocksize), so we optimize the
1532 * append-to-file-case.
1533 *
1534 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
1535 * cleared because write clustering only works for commit
1536 * rpc's, not for the data portion of the write).
1537 */
1538
1539 if (!error && iomode == NFSV3WRITE_UNSTABLE) {
1540 bp->b_flags |= B_NEEDCOMMIT;
1541 if (bp->b_dirtyoff == 0
1542 && bp->b_dirtyend == bp->b_bcount)
1543 bp->b_flags |= B_CLUSTEROK;
1544 } else {
1545 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1546 }
1547 bp->b_flags &= ~B_WRITEINPROG;
1548
1549 /*
1550 * For an interrupted write, the buffer is still valid
1551 * and the write hasn't been pushed to the server yet,
1552 * so we can't set BIO_ERROR and report the interruption
1553 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
1554 * is not relevant, so the rpc attempt is essentially
1555 * a noop. For the case of a V3 write rpc not being
1556 * committed to stable storage, the block is still
1557 * dirty and requires either a commit rpc or another
1558 * write rpc with iomode == NFSV3WRITE_FILESYNC before
1559 * the block is reused. This is indicated by setting
1560 * the B_DELWRI and B_NEEDCOMMIT flags.
1561 *
1562 * If the buffer is marked B_PAGING, it does not reside on
1563 * the vp's paging queues so we cannot call bdirty(). The
1564 * bp in this case is not an NFS cache block so we should
1565 * be safe. XXX
1566 */
1567 if (error == EINTR
1568 || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
1569 int s;
1570
1571 s = splbio();
1572 bp->b_flags &= ~(B_INVAL|B_NOCACHE);
1573 if ((bp->b_flags & B_PAGING) == 0) {
1574 bdirty(bp);
1575 bp->b_flags &= ~B_DONE;
1576 }
1577 if (error && (bp->b_flags & B_ASYNC) == 0)
1578 bp->b_flags |= B_EINTR;
1579 splx(s);
1580 } else {
1581 if (error) {
1582 bp->b_ioflags |= BIO_ERROR;
1583 bp->b_error = np->n_error = error;
1584 np->n_flag |= NWRITEERR;
1585 }
1586 bp->b_dirtyoff = bp->b_dirtyend = 0;
1587 }
1588 } else {
1589 bp->b_resid = 0;
1590 bufdone(bp);
1591 return (0);
1592 }
1593 }
1594 bp->b_resid = uiop->uio_resid;
1595 if (must_commit)
1596 nfs_clearcommit(vp->v_mount);
1597 bufdone(bp);
1598 return (error);
1599}
47#include <sys/buf.h>
48#include <sys/vnode.h>
49#include <sys/mount.h>
50#include <sys/kernel.h>
51
52#include <vm/vm.h>
53#include <vm/vm_extern.h>
54#include <vm/vm_page.h>
55#include <vm/vm_object.h>
56#include <vm/vm_pager.h>
57#include <vm/vnode_pager.h>
58
59#include <nfs/rpcv2.h>
60#include <nfs/nfsproto.h>
61#include <nfs/nfs.h>
62#include <nfs/nfsmount.h>
63#include <nfs/nqnfs.h>
64#include <nfs/nfsnode.h>
65
66static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
67 struct proc *p));
68
69extern int nfs_numasync;
70extern int nfs_pbuf_freecnt;
71extern struct nfsstats nfsstats;
72
73/*
74 * Vnode op for VM getpages.
75 */
76int
77nfs_getpages(ap)
78 struct vop_getpages_args /* {
79 struct vnode *a_vp;
80 vm_page_t *a_m;
81 int a_count;
82 int a_reqpage;
83 vm_ooffset_t a_offset;
84 } */ *ap;
85{
86 int i, error, nextoff, size, toff, count, npages;
87 struct uio uio;
88 struct iovec iov;
89 vm_offset_t kva;
90 struct buf *bp;
91 struct vnode *vp;
92 struct proc *p;
93 struct ucred *cred;
94 struct nfsmount *nmp;
95 vm_page_t *pages;
96
97 vp = ap->a_vp;
98 p = curproc; /* XXX */
99 cred = curproc->p_ucred; /* XXX */
100 nmp = VFSTONFS(vp->v_mount);
101 pages = ap->a_m;
102 count = ap->a_count;
103
104 if (vp->v_object == NULL) {
105 printf("nfs_getpages: called with non-merged cache vnode??\n");
106 return VM_PAGER_ERROR;
107 }
108
109 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
110 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
111 (void)nfs_fsinfo(nmp, vp, cred, p);
112
113 npages = btoc(count);
114
115 /*
116 * If the requested page is partially valid, just return it and
117 * allow the pager to zero-out the blanks. Partially valid pages
118 * can only occur at the file EOF.
119 */
120
121 {
122 vm_page_t m = pages[ap->a_reqpage];
123
124 if (m->valid != 0) {
125 /* handled by vm_fault now */
126 /* vm_page_zero_invalid(m, TRUE); */
127 for (i = 0; i < npages; ++i) {
128 if (i != ap->a_reqpage)
129 vnode_pager_freepage(pages[i]);
130 }
131 return(0);
132 }
133 }
134
135 /*
136 * We use only the kva address for the buffer, but this is extremely
137 * convienient and fast.
138 */
139 bp = getpbuf(&nfs_pbuf_freecnt);
140
141 kva = (vm_offset_t) bp->b_data;
142 pmap_qenter(kva, pages, npages);
143
144 iov.iov_base = (caddr_t) kva;
145 iov.iov_len = count;
146 uio.uio_iov = &iov;
147 uio.uio_iovcnt = 1;
148 uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
149 uio.uio_resid = count;
150 uio.uio_segflg = UIO_SYSSPACE;
151 uio.uio_rw = UIO_READ;
152 uio.uio_procp = p;
153
154 error = nfs_readrpc(vp, &uio, cred);
155 pmap_qremove(kva, npages);
156
157 relpbuf(bp, &nfs_pbuf_freecnt);
158
159 if (error && (uio.uio_resid == count)) {
160 printf("nfs_getpages: error %d\n", error);
161 for (i = 0; i < npages; ++i) {
162 if (i != ap->a_reqpage)
163 vnode_pager_freepage(pages[i]);
164 }
165 return VM_PAGER_ERROR;
166 }
167
168 /*
169 * Calculate the number of bytes read and validate only that number
170 * of bytes. Note that due to pending writes, size may be 0. This
171 * does not mean that the remaining data is invalid!
172 */
173
174 size = count - uio.uio_resid;
175
176 for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
177 vm_page_t m;
178 nextoff = toff + PAGE_SIZE;
179 m = pages[i];
180
181 m->flags &= ~PG_ZERO;
182
183 if (nextoff <= size) {
184 /*
185 * Read operation filled an entire page
186 */
187 m->valid = VM_PAGE_BITS_ALL;
188 vm_page_undirty(m);
189 } else if (size > toff) {
190 /*
191 * Read operation filled a partial page.
192 */
193 m->valid = 0;
194 vm_page_set_validclean(m, 0, size - toff);
195 /* handled by vm_fault now */
196 /* vm_page_zero_invalid(m, TRUE); */
197 }
198
199 if (i != ap->a_reqpage) {
200 /*
201 * Whether or not to leave the page activated is up in
202 * the air, but we should put the page on a page queue
203 * somewhere (it already is in the object). Result:
204 * It appears that emperical results show that
205 * deactivating pages is best.
206 */
207
208 /*
209 * Just in case someone was asking for this page we
210 * now tell them that it is ok to use.
211 */
212 if (!error) {
213 if (m->flags & PG_WANTED)
214 vm_page_activate(m);
215 else
216 vm_page_deactivate(m);
217 vm_page_wakeup(m);
218 } else {
219 vnode_pager_freepage(m);
220 }
221 }
222 }
223 return 0;
224}
225
226/*
227 * Vnode op for VM putpages.
228 */
229int
230nfs_putpages(ap)
231 struct vop_putpages_args /* {
232 struct vnode *a_vp;
233 vm_page_t *a_m;
234 int a_count;
235 int a_sync;
236 int *a_rtvals;
237 vm_ooffset_t a_offset;
238 } */ *ap;
239{
240 struct uio uio;
241 struct iovec iov;
242 vm_offset_t kva;
243 struct buf *bp;
244 int iomode, must_commit, i, error, npages, count;
245 off_t offset;
246 int *rtvals;
247 struct vnode *vp;
248 struct proc *p;
249 struct ucred *cred;
250 struct nfsmount *nmp;
251 struct nfsnode *np;
252 vm_page_t *pages;
253
254 vp = ap->a_vp;
255 np = VTONFS(vp);
256 p = curproc; /* XXX */
257 cred = curproc->p_ucred; /* XXX */
258 nmp = VFSTONFS(vp->v_mount);
259 pages = ap->a_m;
260 count = ap->a_count;
261 rtvals = ap->a_rtvals;
262 npages = btoc(count);
263 offset = IDX_TO_OFF(pages[0]->pindex);
264
265 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
266 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
267 (void)nfs_fsinfo(nmp, vp, cred, p);
268
269 for (i = 0; i < npages; i++) {
270 rtvals[i] = VM_PAGER_AGAIN;
271 }
272
273 /*
274 * When putting pages, do not extend file past EOF.
275 */
276
277 if (offset + count > np->n_size) {
278 count = np->n_size - offset;
279 if (count < 0)
280 count = 0;
281 }
282
283 /*
284 * We use only the kva address for the buffer, but this is extremely
285 * convienient and fast.
286 */
287 bp = getpbuf(&nfs_pbuf_freecnt);
288
289 kva = (vm_offset_t) bp->b_data;
290 pmap_qenter(kva, pages, npages);
291
292 iov.iov_base = (caddr_t) kva;
293 iov.iov_len = count;
294 uio.uio_iov = &iov;
295 uio.uio_iovcnt = 1;
296 uio.uio_offset = offset;
297 uio.uio_resid = count;
298 uio.uio_segflg = UIO_SYSSPACE;
299 uio.uio_rw = UIO_WRITE;
300 uio.uio_procp = p;
301
302 if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
303 iomode = NFSV3WRITE_UNSTABLE;
304 else
305 iomode = NFSV3WRITE_FILESYNC;
306
307 error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit);
308
309 pmap_qremove(kva, npages);
310 relpbuf(bp, &nfs_pbuf_freecnt);
311
312 if (!error) {
313 int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
314 for (i = 0; i < nwritten; i++) {
315 rtvals[i] = VM_PAGER_OK;
316 vm_page_undirty(pages[i]);
317 }
318 if (must_commit)
319 nfs_clearcommit(vp->v_mount);
320 }
321 return rtvals[0];
322}
323
324/*
325 * Vnode op for read using bio
326 */
327int
328nfs_bioread(vp, uio, ioflag, cred)
329 register struct vnode *vp;
330 register struct uio *uio;
331 int ioflag;
332 struct ucred *cred;
333{
334 register struct nfsnode *np = VTONFS(vp);
335 register int biosize, i;
336 struct buf *bp = 0, *rabp;
337 struct vattr vattr;
338 struct proc *p;
339 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
340 daddr_t lbn, rabn;
341 int bcount;
342 int seqcount;
343 int nra, error = 0, n = 0, on = 0;
344
345#ifdef DIAGNOSTIC
346 if (uio->uio_rw != UIO_READ)
347 panic("nfs_read mode");
348#endif
349 if (uio->uio_resid == 0)
350 return (0);
351 if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */
352 return (EINVAL);
353 p = uio->uio_procp;
354
355 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
356 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
357 (void)nfs_fsinfo(nmp, vp, cred, p);
358 if (vp->v_type != VDIR &&
359 (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
360 return (EFBIG);
361 biosize = vp->v_mount->mnt_stat.f_iosize;
362 seqcount = (int)((off_t)(ioflag >> 16) * biosize / BKVASIZE);
363 /*
364 * For nfs, cache consistency can only be maintained approximately.
365 * Although RFC1094 does not specify the criteria, the following is
366 * believed to be compatible with the reference port.
367 * For nqnfs, full cache consistency is maintained within the loop.
368 * For nfs:
369 * If the file's modify time on the server has changed since the
370 * last read rpc or you have written to the file,
371 * you may have lost data cache consistency with the
372 * server, so flush all of the file's data out of the cache.
373 * Then force a getattr rpc to ensure that you have up to date
374 * attributes.
375 * NB: This implies that cache data can be read when up to
376 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
377 * attributes this could be forced by setting n_attrstamp to 0 before
378 * the VOP_GETATTR() call.
379 */
380 if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
381 if (np->n_flag & NMODIFIED) {
382 if (vp->v_type != VREG) {
383 if (vp->v_type != VDIR)
384 panic("nfs: bioread, not dir");
385 nfs_invaldir(vp);
386 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
387 if (error)
388 return (error);
389 }
390 np->n_attrstamp = 0;
391 error = VOP_GETATTR(vp, &vattr, cred, p);
392 if (error)
393 return (error);
394 np->n_mtime = vattr.va_mtime.tv_sec;
395 } else {
396 error = VOP_GETATTR(vp, &vattr, cred, p);
397 if (error)
398 return (error);
399 if (np->n_mtime != vattr.va_mtime.tv_sec) {
400 if (vp->v_type == VDIR)
401 nfs_invaldir(vp);
402 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
403 if (error)
404 return (error);
405 np->n_mtime = vattr.va_mtime.tv_sec;
406 }
407 }
408 }
409 do {
410
411 /*
412 * Get a valid lease. If cached data is stale, flush it.
413 */
414 if (nmp->nm_flag & NFSMNT_NQNFS) {
415 if (NQNFS_CKINVALID(vp, np, ND_READ)) {
416 do {
417 error = nqnfs_getlease(vp, ND_READ, cred, p);
418 } while (error == NQNFS_EXPIRED);
419 if (error)
420 return (error);
421 if (np->n_lrev != np->n_brev ||
422 (np->n_flag & NQNFSNONCACHE) ||
423 ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
424 if (vp->v_type == VDIR)
425 nfs_invaldir(vp);
426 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
427 if (error)
428 return (error);
429 np->n_brev = np->n_lrev;
430 }
431 } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
432 nfs_invaldir(vp);
433 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
434 if (error)
435 return (error);
436 }
437 }
438 if (np->n_flag & NQNFSNONCACHE) {
439 switch (vp->v_type) {
440 case VREG:
441 return (nfs_readrpc(vp, uio, cred));
442 case VLNK:
443 return (nfs_readlinkrpc(vp, uio, cred));
444 case VDIR:
445 break;
446 default:
447 printf(" NQNFSNONCACHE: type %x unexpected\n",
448 vp->v_type);
449 };
450 }
451 switch (vp->v_type) {
452 case VREG:
453 nfsstats.biocache_reads++;
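		/*
		 * lbn is the logical NFS block containing the current
		 * offset, on the byte offset within that block.
		 */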
454 lbn = uio->uio_offset / biosize;
455 on = uio->uio_offset & (biosize - 1);
456
457 /*
458 * Start the read ahead(s), as required.
459 */
460 if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
461 for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
462 (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
463 rabn = lbn + 1 + nra;
464 if (!incore(vp, rabn)) {
465 rabp = nfs_getcacheblk(vp, rabn, biosize, p);
466 if (!rabp)
467 return (EINTR);
468 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
469 rabp->b_flags |= B_ASYNC;
470 rabp->b_iocmd = BIO_READ;
471 vfs_busy_pages(rabp, 0);
472 if (nfs_asyncio(rabp, cred, p)) {
473 rabp->b_flags |= B_INVAL;
474 rabp->b_ioflags |= BIO_ERROR;
475 vfs_unbusy_pages(rabp);
476 brelse(rabp);
477 break;
478 }
479 } else {
480 brelse(rabp);
481 }
482 }
483 }
484 }
485
486 /*
487 * Obtain the buffer cache block. Figure out the buffer size
488 * when we are at EOF. If we are modifying the size of the
489 * buffer based on an EOF condition we need to hold
490 * nfs_rslock() through obtaining the buffer to prevent
491 * a potential writer-appender from messing with n_size.
492	 * Otherwise we may accidentally truncate the buffer and
493 * lose dirty data.
494 *
495 * Note that bcount is *not* DEV_BSIZE aligned.
496 */
497
498again:
499 bcount = biosize;
500 if ((off_t)lbn * biosize >= np->n_size) {
501 bcount = 0;
502 } else if ((off_t)(lbn + 1) * biosize > np->n_size) {
503 bcount = np->n_size - (off_t)lbn * biosize;
504 }
505 if (bcount != biosize) {
506 switch(nfs_rslock(np, p)) {
507 case ENOLCK:
508 goto again;
509 /* not reached */
510 case EINTR:
511 case ERESTART:
512 return(EINTR);
513 /* not reached */
514 default:
515 break;
516 }
517 }
518
519 bp = nfs_getcacheblk(vp, lbn, bcount, p);
520
521 if (bcount != biosize)
522 nfs_rsunlock(np, p);
523 if (!bp)
524 return (EINTR);
525
526 /*
527 * If B_CACHE is not set, we must issue the read. If this
528 * fails, we return an error.
529 */
530
531 if ((bp->b_flags & B_CACHE) == 0) {
532 bp->b_iocmd = BIO_READ;
533 vfs_busy_pages(bp, 0);
534 error = nfs_doio(bp, cred, p);
535 if (error) {
536 brelse(bp);
537 return (error);
538 }
539 }
540
541 /*
542 * on is the offset into the current bp. Figure out how many
543 * bytes we can copy out of the bp. Note that bcount is
544 * NOT DEV_BSIZE aligned.
545 *
546 * Then figure out how many bytes we can copy into the uio.
547 */
548
549 n = 0;
550 if (on < bcount)
551 n = min((unsigned)(bcount - on), uio->uio_resid);
552 break;
553 case VLNK:
554 nfsstats.biocache_readlinks++;
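		/*
		 * Symlinks are cached in a single NFS_MAXPATHLEN-sized
		 * buffer at logical block 0; after the read, b_resid tells
		 * us how much of that buffer is unused.
		 */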
555 bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
556 if (!bp)
557 return (EINTR);
558 if ((bp->b_flags & B_CACHE) == 0) {
559 bp->b_iocmd = BIO_READ;
560 vfs_busy_pages(bp, 0);
561 error = nfs_doio(bp, cred, p);
562 if (error) {
563 bp->b_ioflags |= BIO_ERROR;
564 brelse(bp);
565 return (error);
566 }
567 }
568 n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
569 on = 0;
570 break;
571 case VDIR:
572 nfsstats.biocache_readdirs++;
573 if (np->n_direofoffset
574 && uio->uio_offset >= np->n_direofoffset) {
575 return (0);
576 }
577 lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
578 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
579 bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
580 if (!bp)
581 return (EINTR);
582 if ((bp->b_flags & B_CACHE) == 0) {
583 bp->b_iocmd = BIO_READ;
584 vfs_busy_pages(bp, 0);
585 error = nfs_doio(bp, cred, p);
586 if (error) {
587 brelse(bp);
588 }
589 while (error == NFSERR_BAD_COOKIE) {
590 printf("got bad cookie vp %p bp %p\n", vp, bp);
591 nfs_invaldir(vp);
592 error = nfs_vinvalbuf(vp, 0, cred, p, 1);
593 /*
594 * Yuck! The directory has been modified on the
595 * server. The only way to get the block is by
596 * reading from the beginning to get all the
597 * offset cookies.
598 *
599 * Leave the last bp intact unless there is an error.
600 * Loop back up to the while if the error is another
601				 * NFSERR_BAD_COOKIE (double yuck!).
602 */
603 for (i = 0; i <= lbn && !error; i++) {
604 if (np->n_direofoffset
605 && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
606 return (0);
607 bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
608 if (!bp)
609 return (EINTR);
610 if ((bp->b_flags & B_CACHE) == 0) {
611 bp->b_iocmd = BIO_READ;
612 vfs_busy_pages(bp, 0);
613 error = nfs_doio(bp, cred, p);
614 /*
615 * no error + B_INVAL == directory EOF,
616 * use the block.
617 */
618 if (error == 0 && (bp->b_flags & B_INVAL))
619 break;
620 }
621 /*
622 * An error will throw away the block and the
623 * for loop will break out. If no error and this
624 * is not the block we want, we throw away the
625 * block and go for the next one via the for loop.
626 */
627 if (error || i < lbn)
628 brelse(bp);
629 }
630 }
631 /*
632 * The above while is repeated if we hit another cookie
633 * error. If we hit an error and it wasn't a cookie error,
634 * we give up.
635 */
636 if (error)
637 return (error);
638 }
639
640 /*
641 * If not eof and read aheads are enabled, start one.
642 * (You need the current block first, so that you have the
643 * directory offset cookie of the next block.)
644 */
645 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
646 (bp->b_flags & B_INVAL) == 0 &&
647 (np->n_direofoffset == 0 ||
648 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
649 !(np->n_flag & NQNFSNONCACHE) &&
650 !incore(vp, lbn + 1)) {
651 rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
652 if (rabp) {
653 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
654 rabp->b_flags |= B_ASYNC;
655 rabp->b_iocmd = BIO_READ;
656 vfs_busy_pages(rabp, 0);
657 if (nfs_asyncio(rabp, cred, p)) {
658 rabp->b_flags |= B_INVAL;
659 rabp->b_ioflags |= BIO_ERROR;
660 vfs_unbusy_pages(rabp);
661 brelse(rabp);
662 }
663 } else {
664 brelse(rabp);
665 }
666 }
667 }
668 /*
669		 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
670 * chopped for the EOF condition, we cannot tell how large
671 * NFS directories are going to be until we hit EOF. So
672 * an NFS directory buffer is *not* chopped to its EOF. Now,
673 * it just so happens that b_resid will effectively chop it
674 * to EOF. *BUT* this information is lost if the buffer goes
675 * away and is reconstituted into a B_CACHE state ( due to
676 * being VMIO ) later. So we keep track of the directory eof
677 * in np->n_direofoffset and chop it off as an extra step
678 * right here.
679 */
680 n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
681 if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
682 n = np->n_direofoffset - uio->uio_offset;
683 break;
684 default:
685 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
686 break;
687 };
688
689 if (n > 0) {
690 error = uiomove(bp->b_data + on, (int)n, uio);
691 }
692 switch (vp->v_type) {
693 case VREG:
694 break;
695 case VLNK:
696 n = 0;
697 break;
698 case VDIR:
699 /*
700 * Invalidate buffer if caching is disabled, forcing a
701 * re-read from the remote later.
702 */
703 if (np->n_flag & NQNFSNONCACHE)
704 bp->b_flags |= B_INVAL;
705 break;
706 default:
707 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
708 }
709 brelse(bp);
710 } while (error == 0 && uio->uio_resid > 0 && n > 0);
711 return (error);
712}
713
714/*
715 * Vnode op for write using bio
716 */
717int
718nfs_write(ap)
719 struct vop_write_args /* {
720 struct vnode *a_vp;
721 struct uio *a_uio;
722 int a_ioflag;
723 struct ucred *a_cred;
724 } */ *ap;
725{
726 int biosize;
727 struct uio *uio = ap->a_uio;
728 struct proc *p = uio->uio_procp;
729 struct vnode *vp = ap->a_vp;
730 struct nfsnode *np = VTONFS(vp);
731 struct ucred *cred = ap->a_cred;
732 int ioflag = ap->a_ioflag;
733 struct buf *bp;
734 struct vattr vattr;
735 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
736 daddr_t lbn;
737 int bcount;
738 int n, on, error = 0, iomode, must_commit;
739 int haverslock = 0;
740
741#ifdef DIAGNOSTIC
742 if (uio->uio_rw != UIO_WRITE)
743 panic("nfs_write mode");
744 if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
745 panic("nfs_write proc");
746#endif
747 if (vp->v_type != VREG)
748 return (EIO);
749 if (np->n_flag & NWRITEERR) {
750 np->n_flag &= ~NWRITEERR;
751 return (np->n_error);
752 }
753 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
754 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
755 (void)nfs_fsinfo(nmp, vp, cred, p);
756
757 /*
758 * Synchronously flush pending buffers if we are in synchronous
759 * mode or if we are appending.
760 */
761 if (ioflag & (IO_APPEND | IO_SYNC)) {
762 if (np->n_flag & NMODIFIED) {
763 np->n_attrstamp = 0;
764 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
765 if (error)
766 return (error);
767 }
768 }
769
770 /*
771 * If IO_APPEND then load uio_offset. We restart here if we cannot
772 * get the append lock.
773 */
774restart:
775 if (ioflag & IO_APPEND) {
776 np->n_attrstamp = 0;
777 error = VOP_GETATTR(vp, &vattr, cred, p);
778 if (error)
779 return (error);
780 uio->uio_offset = np->n_size;
781 }
782
783 if (uio->uio_offset < 0)
784 return (EINVAL);
785 if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
786 return (EFBIG);
787 if (uio->uio_resid == 0)
788 return (0);
789
790 /*
791 * We need to obtain the rslock if we intend to modify np->n_size
792	 * in order to guarantee the append point with multiple contending
793	 * writers, to guarantee that no other appenders modify n_size
794	 * while we are trying to obtain a truncated buffer (i.e. to avoid
795	 * accidentally truncating data written by another appender due to
796 * the race), and to ensure that the buffer is populated prior to
797 * our extending of the file. We hold rslock through the entire
798 * operation.
799 *
800 * Note that we do not synchronize the case where someone truncates
801 * the file while we are appending to it because attempting to lock
802 * this case may deadlock other parts of the system unexpectedly.
803 */
804 if ((ioflag & IO_APPEND) ||
805 uio->uio_offset + uio->uio_resid > np->n_size) {
806 switch(nfs_rslock(np, p)) {
807 case ENOLCK:
808 goto restart;
809 /* not reached */
810 case EINTR:
811 case ERESTART:
812 return(EINTR);
813 /* not reached */
814 default:
815 break;
816 }
817 haverslock = 1;
818 }
819
820 /*
821 * Maybe this should be above the vnode op call, but so long as
822	 * file servers have no limits, I don't think it matters
823 */
824 if (p && uio->uio_offset + uio->uio_resid >
825 p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
826 psignal(p, SIGXFSZ);
827 if (haverslock)
828 nfs_rsunlock(np, p);
829 return (EFBIG);
830 }
831
832 biosize = vp->v_mount->mnt_stat.f_iosize;
833
834 do {
835 /*
836 * Check for a valid write lease.
837 */
838 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
839 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
840 do {
841 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
842 } while (error == NQNFS_EXPIRED);
843 if (error)
844 break;
845 if (np->n_lrev != np->n_brev ||
846 (np->n_flag & NQNFSNONCACHE)) {
847 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
848 if (error)
849 break;
850 np->n_brev = np->n_lrev;
851 }
852 }
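		/*
		 * If caching is disabled for this node, bypass the buffer
		 * cache and push the data directly with a synchronous
		 * FILESYNC write rpc (only done for a single iovec).
		 */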
853 if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
854 iomode = NFSV3WRITE_FILESYNC;
855 error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
856 if (must_commit)
857 nfs_clearcommit(vp->v_mount);
858 break;
859 }
860 nfsstats.biocache_writes++;
861 lbn = uio->uio_offset / biosize;
862 on = uio->uio_offset & (biosize-1);
863 n = min((unsigned)(biosize - on), uio->uio_resid);
864again:
865 /*
866 * Handle direct append and file extension cases, calculate
867 * unaligned buffer size.
868 */
869
870 if (uio->uio_offset == np->n_size && n) {
871 /*
872 * Get the buffer (in its pre-append state to maintain
873 * B_CACHE if it was previously set). Resize the
874 * nfsnode after we have locked the buffer to prevent
875 * readers from reading garbage.
876 */
877 bcount = on;
878 bp = nfs_getcacheblk(vp, lbn, bcount, p);
879
880 if (bp != NULL) {
881 long save;
882
883 np->n_size = uio->uio_offset + n;
884 np->n_flag |= NMODIFIED;
885 vnode_pager_setsize(vp, np->n_size);
886
887 save = bp->b_flags & B_CACHE;
888 bcount += n;
889 allocbuf(bp, bcount);
890 bp->b_flags |= save;
891 }
892 } else {
893 /*
894 * Obtain the locked cache block first, and then
895 * adjust the file's size as appropriate.
896 */
897 bcount = on + n;
898 if ((off_t)lbn * biosize + bcount < np->n_size) {
899 if ((off_t)(lbn + 1) * biosize < np->n_size)
900 bcount = biosize;
901 else
902 bcount = np->n_size - (off_t)lbn * biosize;
903 }
904
905 bp = nfs_getcacheblk(vp, lbn, bcount, p);
906
907 if (uio->uio_offset + n > np->n_size) {
908 np->n_size = uio->uio_offset + n;
909 np->n_flag |= NMODIFIED;
910 vnode_pager_setsize(vp, np->n_size);
911 }
912 }
913
914 if (!bp) {
915 error = EINTR;
916 break;
917 }
918
919 /*
920 * Issue a READ if B_CACHE is not set. In special-append
921 * mode, B_CACHE is based on the buffer prior to the write
922 * op and is typically set, avoiding the read. If a read
923 * is required in special append mode, the server will
924 * probably send us a short-read since we extended the file
925	 * on our end, resulting in b_resid == 0 and, thus,
926 * B_CACHE getting set.
927 *
928 * We can also avoid issuing the read if the write covers
929 * the entire buffer. We have to make sure the buffer state
930 * is reasonable in this case since we will not be initiating
931 * I/O. See the comments in kern/vfs_bio.c's getblk() for
932 * more information.
933 *
934 * B_CACHE may also be set due to the buffer being cached
935 * normally.
936 */
937
938 if (on == 0 && n == bcount) {
939 bp->b_flags |= B_CACHE;
940 bp->b_flags &= ~B_INVAL;
941 bp->b_ioflags &= ~BIO_ERROR;
942 }
943
944 if ((bp->b_flags & B_CACHE) == 0) {
945 bp->b_iocmd = BIO_READ;
946 vfs_busy_pages(bp, 0);
947 error = nfs_doio(bp, cred, p);
948 if (error) {
949 brelse(bp);
950 break;
951 }
952 }
953 if (!bp) {
954 error = EINTR;
955 break;
956 }
957 if (bp->b_wcred == NOCRED) {
958 crhold(cred);
959 bp->b_wcred = cred;
960 }
961 np->n_flag |= NMODIFIED;
962
963 /*
964 * If dirtyend exceeds file size, chop it down. This should
965 * not normally occur but there is an append race where it
966 * might occur XXX, so we log it.
967 *
968 * If the chopping creates a reverse-indexed or degenerate
969 * situation with dirtyoff/end, we 0 both of them.
970 */
971
972 if (bp->b_dirtyend > bcount) {
973 printf("NFS append race @%lx:%d\n",
974 (long)bp->b_blkno * DEV_BSIZE,
975 bp->b_dirtyend - bcount);
976 bp->b_dirtyend = bcount;
977 }
978
979 if (bp->b_dirtyoff >= bp->b_dirtyend)
980 bp->b_dirtyoff = bp->b_dirtyend = 0;
981
982 /*
983 * If the new write will leave a contiguous dirty
984 * area, just update the b_dirtyoff and b_dirtyend,
985 * otherwise force a write rpc of the old dirty area.
986 *
987 * While it is possible to merge discontiguous writes due to
988 * our having a B_CACHE buffer ( and thus valid read data
989 * for the hole), we don't because it could lead to
990 * significant cache coherency problems with multiple clients,
991 * especially if locking is implemented later on.
992 *
993 * as an optimization we could theoretically maintain
994 * a linked list of discontinuous areas, but we would still
995 * have to commit them separately so there isn't much
996 * advantage to it except perhaps a bit of asynchronization.
997 */
998
999 if (bp->b_dirtyend > 0 &&
1000 (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
1001 if (BUF_WRITE(bp) == EINTR)
1002 return (EINTR);
1003 goto again;
1004 }
1005
1006 /*
1007 * Check for valid write lease and get one as required.
1008 * In case getblk() and/or bwrite() delayed us.
1009 */
1010 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
1011 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
1012 do {
1013 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
1014 } while (error == NQNFS_EXPIRED);
1015 if (error) {
1016 brelse(bp);
1017 break;
1018 }
1019 if (np->n_lrev != np->n_brev ||
1020 (np->n_flag & NQNFSNONCACHE)) {
1021 brelse(bp);
1022 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1023 if (error)
1024 break;
1025 np->n_brev = np->n_lrev;
1026 goto again;
1027 }
1028 }
1029
1030 error = uiomove((char *)bp->b_data + on, n, uio);
1031
1032 /*
1033 * Since this block is being modified, it must be written
1034 * again and not just committed. Since write clustering does
1035 * not work for the stage 1 data write, only the stage 2
1036 * commit rpc, we have to clear B_CLUSTEROK as well.
1037 */
1038 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1039
1040 if (error) {
1041 bp->b_ioflags |= BIO_ERROR;
1042 brelse(bp);
1043 break;
1044 }
1045
1046 /*
1047 * Only update dirtyoff/dirtyend if not a degenerate
1048 * condition.
1049 */
1050 if (n) {
1051 if (bp->b_dirtyend > 0) {
1052 bp->b_dirtyoff = min(on, bp->b_dirtyoff);
1053 bp->b_dirtyend = max((on + n), bp->b_dirtyend);
1054 } else {
1055 bp->b_dirtyoff = on;
1056 bp->b_dirtyend = on + n;
1057 }
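			/*
			 * Mark the range we just copied in as valid and
			 * clean at the page level so it is not needlessly
			 * re-read; dirtiness is tracked via b_dirtyoff/end.
			 */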
1058 vfs_bio_set_validclean(bp, on, n);
1059 }
1060
1061 /*
1062		 * If the lease is non-cacheable or IO_SYNC is set, do bwrite().
1063 *
1064 * IO_INVAL appears to be unused. The idea appears to be
1065 * to turn off caching in this case. Very odd. XXX
1066 */
1067 if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
1068 if (ioflag & IO_INVAL)
1069 bp->b_flags |= B_NOCACHE;
1070 error = BUF_WRITE(bp);
1071 if (error)
1072 break;
1073 if (np->n_flag & NQNFSNONCACHE) {
1074 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1075 if (error)
1076 break;
1077 }
1078 } else if ((n + on) == biosize &&
1079 (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
1080 bp->b_flags |= B_ASYNC;
1081 (void)nfs_writebp(bp, 0, 0);
1082 } else {
1083 bdwrite(bp);
1084 }
1085 } while (uio->uio_resid > 0 && n > 0);
1086
1087 if (haverslock)
1088 nfs_rsunlock(np, p);
1089
1090 return (error);
1091}
1092
1093/*
1094 * Get an nfs cache block.
1095 *
1096 * Allocate a new one if the block isn't currently in the cache
1097 * and return the block marked busy. If the calling process is
1098 * interrupted by a signal for an interruptible mount point, return
1099 * NULL.
1100 *
1101 * The caller must carefully deal with the possible B_INVAL state of
1102 * the buffer. nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
1103 * indirectly), so synchronous reads can be issued without worrying about
1104 * the B_INVAL state. We have to be a little more careful when dealing
1105 * with writes (see comments in nfs_write()) when extending a file past
1106 * its EOF.
1107 */
1108static struct buf *
1109nfs_getcacheblk(vp, bn, size, p)
1110 struct vnode *vp;
1111 daddr_t bn;
1112 int size;
1113 struct proc *p;
1114{
1115 register struct buf *bp;
1116 struct mount *mp;
1117 struct nfsmount *nmp;
1118
1119 mp = vp->v_mount;
1120 nmp = VFSTONFS(mp);
1121
1122 if (nmp->nm_flag & NFSMNT_INT) {
1123 bp = getblk(vp, bn, size, PCATCH, 0);
1124 while (bp == (struct buf *)0) {
1125 if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
1126 return ((struct buf *)0);
1127 bp = getblk(vp, bn, size, 0, 2 * hz);
1128 }
1129 } else {
1130 bp = getblk(vp, bn, size, 0, 0);
1131 }
1132
1133 if (vp->v_type == VREG) {
1134 int biosize;
1135
1136 biosize = mp->mnt_stat.f_iosize;
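		/*
		 * Express the logical block number in DEV_BSIZE units;
		 * nfs_doio() reconstructs the file offset from b_blkno.
		 */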
1137 bp->b_blkno = bn * (biosize / DEV_BSIZE);
1138 }
1139 return (bp);
1140}
1141
1142/*
1143 * Flush and invalidate all dirty buffers. If another process is already
1144 * doing the flush, just wait for completion.
1145 */
1146int
1147nfs_vinvalbuf(vp, flags, cred, p, intrflg)
1148 struct vnode *vp;
1149 int flags;
1150 struct ucred *cred;
1151 struct proc *p;
1152 int intrflg;
1153{
1154 register struct nfsnode *np = VTONFS(vp);
1155 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1156 int error = 0, slpflag, slptimeo;
1157
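	/*
	 * The vnode is being recycled; its buffers are about to be torn
	 * down anyway, so there is nothing for us to flush.
	 */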
1158 if (vp->v_flag & VXLOCK) {
1159 return (0);
1160 }
1161
1162 if ((nmp->nm_flag & NFSMNT_INT) == 0)
1163 intrflg = 0;
1164 if (intrflg) {
1165 slpflag = PCATCH;
1166 slptimeo = 2 * hz;
1167 } else {
1168 slpflag = 0;
1169 slptimeo = 0;
1170 }
1171 /*
1172 * First wait for any other process doing a flush to complete.
1173 */
1174 while (np->n_flag & NFLUSHINPROG) {
1175 np->n_flag |= NFLUSHWANT;
1176 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
1177 slptimeo);
1178 if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
1179 return (EINTR);
1180 }
1181
1182 /*
1183 * Now, flush as required.
1184 */
1185 np->n_flag |= NFLUSHINPROG;
1186 error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
1187 while (error) {
1188 if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
1189 np->n_flag &= ~NFLUSHINPROG;
1190 if (np->n_flag & NFLUSHWANT) {
1191 np->n_flag &= ~NFLUSHWANT;
1192 wakeup((caddr_t)&np->n_flag);
1193 }
1194 return (EINTR);
1195 }
1196 error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
1197 }
1198 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
1199 if (np->n_flag & NFLUSHWANT) {
1200 np->n_flag &= ~NFLUSHWANT;
1201 wakeup((caddr_t)&np->n_flag);
1202 }
1203 return (0);
1204}
1205
1206/*
1207 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
1208 * This is mainly to avoid queueing async I/O requests when the nfsiods
1209 * are all hung on a dead server.
1210 *
1211 * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
1212 * is eventually dequeued by the async daemon, nfs_doio() *will*.
1213 */
1214int
1215nfs_asyncio(bp, cred, procp)
1216 register struct buf *bp;
1217 struct ucred *cred;
1218 struct proc *procp;
1219{
1220 struct nfsmount *nmp;
1221 int i;
1222 int gotiod;
1223 int slpflag = 0;
1224 int slptimeo = 0;
1225 int error;
1226
1227 /*
1228 * If no async daemons then return EIO to force caller to run the rpc
1229 * synchronously.
1230 */
1231 if (nfs_numasync == 0)
1232 return (EIO);
1233
1234 nmp = VFSTONFS(bp->b_vp->v_mount);
1235
1236 /*
1237	 * Commits are usually short and sweet, so let's save some cpu and
1238 * leave the async daemons for more important rpc's (such as reads
1239 * and writes).
1240 */
1241 if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
1242 (nmp->nm_bufqiods > nfs_numasync / 2)) {
1243 return(EIO);
1244 }
1245
1246again:
1247 if (nmp->nm_flag & NFSMNT_INT)
1248 slpflag = PCATCH;
1249 gotiod = FALSE;
1250
1251 /*
1252 * Find a free iod to process this request.
1253 */
1254 for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
1255 if (nfs_iodwant[i]) {
1256 /*
1257 * Found one, so wake it up and tell it which
1258 * mount to process.
1259 */
1260 NFS_DPF(ASYNCIO,
1261 ("nfs_asyncio: waking iod %d for mount %p\n",
1262 i, nmp));
1263 nfs_iodwant[i] = (struct proc *)0;
1264 nfs_iodmount[i] = nmp;
1265 nmp->nm_bufqiods++;
1266 wakeup((caddr_t)&nfs_iodwant[i]);
1267 gotiod = TRUE;
1268 break;
1269 }
1270
1271 /*
1272 * If none are free, we may already have an iod working on this mount
1273 * point. If so, it will process our request.
1274 */
1275 if (!gotiod) {
1276 if (nmp->nm_bufqiods > 0) {
1277 NFS_DPF(ASYNCIO,
1278 ("nfs_asyncio: %d iods are already processing mount %p\n",
1279 nmp->nm_bufqiods, nmp));
1280 gotiod = TRUE;
1281 }
1282 }
1283
1284 /*
1285 * If we have an iod which can process the request, then queue
1286 * the buffer.
1287 */
1288 if (gotiod) {
1289 /*
1290 * Ensure that the queue never grows too large. We still want
1291		 * to asynchronize so we block rather than return EIO.
1292 */
1293 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
1294 NFS_DPF(ASYNCIO,
1295 ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
1296 nmp->nm_bufqwant = TRUE;
1297 error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
1298 "nfsaio", slptimeo);
1299 if (error) {
1300 if (nfs_sigintr(nmp, NULL, procp))
1301 return (EINTR);
1302 if (slpflag == PCATCH) {
1303 slpflag = 0;
1304 slptimeo = 2 * hz;
1305 }
1306 }
1307 /*
1308 * We might have lost our iod while sleeping,
1309			 * so check and loop if necessary.
1310 */
1311 if (nmp->nm_bufqiods == 0) {
1312 NFS_DPF(ASYNCIO,
1313 ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
1314 goto again;
1315 }
1316 }
1317
1318 if (bp->b_iocmd == BIO_READ) {
1319 if (bp->b_rcred == NOCRED && cred != NOCRED) {
1320 crhold(cred);
1321 bp->b_rcred = cred;
1322 }
1323 } else {
1324 bp->b_flags |= B_WRITEINPROG;
1325 if (bp->b_wcred == NOCRED && cred != NOCRED) {
1326 crhold(cred);
1327 bp->b_wcred = cred;
1328 }
1329 }
1330
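		/*
		 * Hand the buffer off: make its lock kernel-owned so the
		 * iod can release it, then queue it on the mount's request
		 * list for an nfsiod to pick up.
		 */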
1331 BUF_KERNPROC(bp);
1332 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
1333 nmp->nm_bufqlen++;
1334 return (0);
1335 }
1336
1337 /*
1338 * All the iods are busy on other mounts, so return EIO to
1339 * force the caller to process the i/o synchronously.
1340 */
1341 NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
1342 return (EIO);
1343}
1344
1345/*
1346 * Do an I/O operation to/from a cache block. This may be called
1347 * synchronously or from an nfsiod.
1348 */
1349int
1350nfs_doio(bp, cr, p)
1351 struct buf *bp;
1352 struct ucred *cr;
1353 struct proc *p;
1354{
1355 struct uio *uiop;
1356 struct vnode *vp;
1357 struct nfsnode *np;
1358 struct nfsmount *nmp;
1359 int error = 0, iomode, must_commit = 0;
1360 struct uio uio;
1361 struct iovec io;
1362
1363 vp = bp->b_vp;
1364 np = VTONFS(vp);
1365 nmp = VFSTONFS(vp->v_mount);
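	/*
	 * Build a kernel-space uio describing the transfer; the iovec
	 * base and length are filled in below once we know which part of
	 * the buffer to move.
	 */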
1366 uiop = &uio;
1367 uiop->uio_iov = &io;
1368 uiop->uio_iovcnt = 1;
1369 uiop->uio_segflg = UIO_SYSSPACE;
1370 uiop->uio_procp = p;
1371
1372 /*
1373 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We
1374 * do this here so we do not have to do it in all the code that
1375 * calls us.
1376 */
1377 bp->b_flags &= ~B_INVAL;
1378 bp->b_ioflags &= ~BIO_ERROR;
1379
1380 KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
1381
1382 /*
1383 * Historically, paging was done with physio, but no more.
1384 */
1385 if (bp->b_flags & B_PHYS) {
1386 /*
1387 * ...though reading /dev/drum still gets us here.
1388 */
1389 io.iov_len = uiop->uio_resid = bp->b_bcount;
1390 /* mapping was done by vmapbuf() */
1391 io.iov_base = bp->b_data;
1392 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
1393 if (bp->b_iocmd == BIO_READ) {
1394 uiop->uio_rw = UIO_READ;
1395 nfsstats.read_physios++;
1396 error = nfs_readrpc(vp, uiop, cr);
1397 } else {
1398 int com;
1399
1400 iomode = NFSV3WRITE_DATASYNC;
1401 uiop->uio_rw = UIO_WRITE;
1402 nfsstats.write_physios++;
1403 error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
1404 }
1405 if (error) {
1406 bp->b_ioflags |= BIO_ERROR;
1407 bp->b_error = error;
1408 }
1409 } else if (bp->b_iocmd == BIO_READ) {
1410 io.iov_len = uiop->uio_resid = bp->b_bcount;
1411 io.iov_base = bp->b_data;
1412 uiop->uio_rw = UIO_READ;
1413 switch (vp->v_type) {
1414 case VREG:
1415 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
1416 nfsstats.read_bios++;
1417 error = nfs_readrpc(vp, uiop, cr);
1418 if (!error) {
1419 if (uiop->uio_resid) {
1420 /*
1421 * If we had a short read with no error, we must have
1422 * hit a file hole. We should zero-fill the remainder.
1423 * This can also occur if the server hits the file EOF.
1424 *
1425 * Holes used to be able to occur due to pending
1426 * writes, but that is not possible any longer.
1427 */
1428 int nread = bp->b_bcount - uiop->uio_resid;
1429 int left = bp->b_bcount - nread;
1430
1431 if (left > 0)
1432 bzero((char *)bp->b_data + nread, left);
1433 uiop->uio_resid = 0;
1434 }
1435 }
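			/*
			 * If this vnode backs an executable that is in use
			 * (VTEXT) and the file has changed on the server,
			 * kill the process rather than let it run with
			 * inconsistent text.
			 */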
1436 if (p && (vp->v_flag & VTEXT) &&
1437 (((nmp->nm_flag & NFSMNT_NQNFS) &&
1438 NQNFS_CKINVALID(vp, np, ND_READ) &&
1439 np->n_lrev != np->n_brev) ||
1440 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
1441 np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
1442 uprintf("Process killed due to text file modification\n");
1443 psignal(p, SIGKILL);
1444 PHOLD(p);
1445 }
1446 break;
1447 case VLNK:
1448 uiop->uio_offset = (off_t)0;
1449 nfsstats.readlink_bios++;
1450 error = nfs_readlinkrpc(vp, uiop, cr);
1451 break;
1452 case VDIR:
1453 nfsstats.readdir_bios++;
1454 uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
1455 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
1456 error = nfs_readdirplusrpc(vp, uiop, cr);
1457 if (error == NFSERR_NOTSUPP)
1458 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
1459 }
1460 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
1461 error = nfs_readdirrpc(vp, uiop, cr);
1462 /*
1463 * end-of-directory sets B_INVAL but does not generate an
1464 * error.
1465 */
1466 if (error == 0 && uiop->uio_resid == bp->b_bcount)
1467 bp->b_flags |= B_INVAL;
1468 break;
1469 default:
1470 printf("nfs_doio: type %x unexpected\n",vp->v_type);
1471 break;
1472 };
1473 if (error) {
1474 bp->b_ioflags |= BIO_ERROR;
1475 bp->b_error = error;
1476 }
1477 } else {
1478 /*
1479 * If we only need to commit, try to commit
1480 */
1481 if (bp->b_flags & B_NEEDCOMMIT) {
1482 int retv;
1483 off_t off;
1484
1485 off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
1486 bp->b_flags |= B_WRITEINPROG;
1487 retv = nfs_commit(
1488 bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff,
1489 bp->b_wcred, p);
1490 bp->b_flags &= ~B_WRITEINPROG;
1491 if (retv == 0) {
1492 bp->b_dirtyoff = bp->b_dirtyend = 0;
1493 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1494 bp->b_resid = 0;
1495 bufdone(bp);
1496 return (0);
1497 }
1498 if (retv == NFSERR_STALEWRITEVERF) {
1499 nfs_clearcommit(bp->b_vp->v_mount);
1500 }
1501 }
1502
1503 /*
1504 * Setup for actual write
1505 */
1506
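		/*
		 * Clamp the dirty region so the write rpc never extends
		 * past the file's current size.
		 */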
1507 if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
1508 bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
1509
1510 if (bp->b_dirtyend > bp->b_dirtyoff) {
1511 io.iov_len = uiop->uio_resid = bp->b_dirtyend
1512 - bp->b_dirtyoff;
1513 uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
1514 + bp->b_dirtyoff;
1515 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
1516 uiop->uio_rw = UIO_WRITE;
1517 nfsstats.write_bios++;
1518
1519 if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
1520 iomode = NFSV3WRITE_UNSTABLE;
1521 else
1522 iomode = NFSV3WRITE_FILESYNC;
1523
1524 bp->b_flags |= B_WRITEINPROG;
1525 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
1526
1527 /*
1528 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
1529 * to cluster the buffers needing commit. This will allow
1530 * the system to submit a single commit rpc for the whole
1531 * cluster. We can do this even if the buffer is not 100%
1532 * dirty (relative to the NFS blocksize), so we optimize the
1533			 * append-to-file case.
1534 *
1535 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
1536 * cleared because write clustering only works for commit
1537 * rpc's, not for the data portion of the write).
1538 */
1539
1540 if (!error && iomode == NFSV3WRITE_UNSTABLE) {
1541 bp->b_flags |= B_NEEDCOMMIT;
1542 if (bp->b_dirtyoff == 0
1543 && bp->b_dirtyend == bp->b_bcount)
1544 bp->b_flags |= B_CLUSTEROK;
1545 } else {
1546 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1547 }
1548 bp->b_flags &= ~B_WRITEINPROG;
1549
1550 /*
1551 * For an interrupted write, the buffer is still valid
1552			 * so we can't set BIO_ERROR; instead we report the interruption
1553			 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
1554 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
1555 * is not relevant, so the rpc attempt is essentially
1556 * a noop. For the case of a V3 write rpc not being
1557 * committed to stable storage, the block is still
1558 * dirty and requires either a commit rpc or another
1559 * write rpc with iomode == NFSV3WRITE_FILESYNC before
1560 * the block is reused. This is indicated by setting
1561 * the B_DELWRI and B_NEEDCOMMIT flags.
1562 *
1563 * If the buffer is marked B_PAGING, it does not reside on
1564 * the vp's paging queues so we cannot call bdirty(). The
1565 * bp in this case is not an NFS cache block so we should
1566 * be safe. XXX
1567 */
1568 if (error == EINTR
1569 || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
1570 int s;
1571
1572 s = splbio();
1573 bp->b_flags &= ~(B_INVAL|B_NOCACHE);
1574 if ((bp->b_flags & B_PAGING) == 0) {
1575 bdirty(bp);
1576 bp->b_flags &= ~B_DONE;
1577 }
1578 if (error && (bp->b_flags & B_ASYNC) == 0)
1579 bp->b_flags |= B_EINTR;
1580 splx(s);
1581 } else {
1582 if (error) {
1583 bp->b_ioflags |= BIO_ERROR;
1584 bp->b_error = np->n_error = error;
1585 np->n_flag |= NWRITEERR;
1586 }
1587 bp->b_dirtyoff = bp->b_dirtyend = 0;
1588 }
1589 } else {
1590 bp->b_resid = 0;
1591 bufdone(bp);
1592 return (0);
1593 }
1594 }
1595 bp->b_resid = uiop->uio_resid;
1596 if (must_commit)
1597 nfs_clearcommit(vp->v_mount);
1598 bufdone(bp);
1599 return (error);
1600}