nfs_bio.c (revision 59249) -> nfs_bio.c (revision 60041)
1/*
2 * Copyright (c) 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Rick Macklem at The University of Guelph.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
37 * $FreeBSD: head/sys/nfsclient/nfs_bio.c 59249 2000-04-15 05:54:02Z phk $
37 * $FreeBSD: head/sys/nfsclient/nfs_bio.c 60041 2000-05-05 09:59:14Z phk $
38 */
39
40
41#include <sys/param.h>
42#include <sys/systm.h>
43#include <sys/resourcevar.h>
44#include <sys/signalvar.h>
45#include <sys/proc.h>
46#include <sys/bio.h>
46#include <sys/buf.h>
47#include <sys/vnode.h>
48#include <sys/mount.h>
49#include <sys/kernel.h>
50
51#include <vm/vm.h>
52#include <vm/vm_extern.h>
53#include <vm/vm_page.h>
54#include <vm/vm_object.h>
55#include <vm/vm_pager.h>
56#include <vm/vnode_pager.h>
57
58#include <nfs/rpcv2.h>
59#include <nfs/nfsproto.h>
60#include <nfs/nfs.h>
61#include <nfs/nfsmount.h>
62#include <nfs/nqnfs.h>
63#include <nfs/nfsnode.h>
64
65static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
66 struct proc *p));
67
68extern int nfs_numasync;
69extern int nfs_pbuf_freecnt;
70extern struct nfsstats nfsstats;
71
72/*
73 * Vnode op for VM getpages.
74 */
75int
76nfs_getpages(ap)
77 struct vop_getpages_args /* {
78 struct vnode *a_vp;
79 vm_page_t *a_m;
80 int a_count;
81 int a_reqpage;
82 vm_ooffset_t a_offset;
83 } */ *ap;
84{
85 int i, error, nextoff, size, toff, count, npages;
86 struct uio uio;
87 struct iovec iov;
88 vm_offset_t kva;
89 struct buf *bp;
90 struct vnode *vp;
91 struct proc *p;
92 struct ucred *cred;
93 struct nfsmount *nmp;
94 vm_page_t *pages;
95
96 vp = ap->a_vp;
97 p = curproc; /* XXX */
98 cred = curproc->p_ucred; /* XXX */
99 nmp = VFSTONFS(vp->v_mount);
100 pages = ap->a_m;
101 count = ap->a_count;
102
103 if (vp->v_object == NULL) {
104 printf("nfs_getpages: called with non-merged cache vnode??\n");
105 return VM_PAGER_ERROR;
106 }
107
108 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
109 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
110 (void)nfs_fsinfo(nmp, vp, cred, p);
111
112 npages = btoc(count);
113
114 /*
115 * If the requested page is partially valid, just return it and
116 * allow the pager to zero-out the blanks. Partially valid pages
117 * can only occur at the file EOF.
118 */
119
120 {
121 vm_page_t m = pages[ap->a_reqpage];
122
123 if (m->valid != 0) {
124 /* handled by vm_fault now */
125 /* vm_page_zero_invalid(m, TRUE); */
126 for (i = 0; i < npages; ++i) {
127 if (i != ap->a_reqpage)
128 vnode_pager_freepage(pages[i]);
129 }
130 return(0);
131 }
132 }
133
134 /*
135 * We use only the kva address for the buffer, but this is extremely
 136	 * convenient and fast.
137 */
138 bp = getpbuf(&nfs_pbuf_freecnt);
139
140 kva = (vm_offset_t) bp->b_data;
141 pmap_qenter(kva, pages, npages);
142
143 iov.iov_base = (caddr_t) kva;
144 iov.iov_len = count;
145 uio.uio_iov = &iov;
146 uio.uio_iovcnt = 1;
147 uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
148 uio.uio_resid = count;
149 uio.uio_segflg = UIO_SYSSPACE;
150 uio.uio_rw = UIO_READ;
151 uio.uio_procp = p;
152
153 error = nfs_readrpc(vp, &uio, cred);
154 pmap_qremove(kva, npages);
155
156 relpbuf(bp, &nfs_pbuf_freecnt);
157
158 if (error && (uio.uio_resid == count)) {
159 printf("nfs_getpages: error %d\n", error);
160 for (i = 0; i < npages; ++i) {
161 if (i != ap->a_reqpage)
162 vnode_pager_freepage(pages[i]);
163 }
164 return VM_PAGER_ERROR;
165 }
166
167 /*
168 * Calculate the number of bytes read and validate only that number
169 * of bytes. Note that due to pending writes, size may be 0. This
170 * does not mean that the remaining data is invalid!
171 */
172
173 size = count - uio.uio_resid;
174
175 for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
176 vm_page_t m;
177 nextoff = toff + PAGE_SIZE;
178 m = pages[i];
179
180 m->flags &= ~PG_ZERO;
181
182 if (nextoff <= size) {
183 /*
184 * Read operation filled an entire page
185 */
186 m->valid = VM_PAGE_BITS_ALL;
187 vm_page_undirty(m);
188 } else if (size > toff) {
189 /*
190 * Read operation filled a partial page.
191 */
192 m->valid = 0;
193 vm_page_set_validclean(m, 0, size - toff);
194 /* handled by vm_fault now */
195 /* vm_page_zero_invalid(m, TRUE); */
196 }
197
198 if (i != ap->a_reqpage) {
199 /*
200 * Whether or not to leave the page activated is up in
201 * the air, but we should put the page on a page queue
202 * somewhere (it already is in the object). Result:
 203	 * It appears that empirical results show that
204 * deactivating pages is best.
205 */
206
207 /*
208 * Just in case someone was asking for this page we
209 * now tell them that it is ok to use.
210 */
211 if (!error) {
212 if (m->flags & PG_WANTED)
213 vm_page_activate(m);
214 else
215 vm_page_deactivate(m);
216 vm_page_wakeup(m);
217 } else {
218 vnode_pager_freepage(m);
219 }
220 }
221 }
222 return 0;
223}
224
225/*
226 * Vnode op for VM putpages.
227 */
228int
229nfs_putpages(ap)
230 struct vop_putpages_args /* {
231 struct vnode *a_vp;
232 vm_page_t *a_m;
233 int a_count;
234 int a_sync;
235 int *a_rtvals;
236 vm_ooffset_t a_offset;
237 } */ *ap;
238{
239 struct uio uio;
240 struct iovec iov;
241 vm_offset_t kva;
242 struct buf *bp;
243 int iomode, must_commit, i, error, npages, count;
244 off_t offset;
245 int *rtvals;
246 struct vnode *vp;
247 struct proc *p;
248 struct ucred *cred;
249 struct nfsmount *nmp;
250 struct nfsnode *np;
251 vm_page_t *pages;
252
253 vp = ap->a_vp;
254 np = VTONFS(vp);
255 p = curproc; /* XXX */
256 cred = curproc->p_ucred; /* XXX */
257 nmp = VFSTONFS(vp->v_mount);
258 pages = ap->a_m;
259 count = ap->a_count;
260 rtvals = ap->a_rtvals;
261 npages = btoc(count);
262 offset = IDX_TO_OFF(pages[0]->pindex);
263
264 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
265 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
266 (void)nfs_fsinfo(nmp, vp, cred, p);
267
268 for (i = 0; i < npages; i++) {
269 rtvals[i] = VM_PAGER_AGAIN;
270 }
271
272 /*
273 * When putting pages, do not extend file past EOF.
274 */
275
276 if (offset + count > np->n_size) {
277 count = np->n_size - offset;
278 if (count < 0)
279 count = 0;
280 }
281
282 /*
283 * We use only the kva address for the buffer, but this is extremely
 284	 * convenient and fast.
285 */
286 bp = getpbuf(&nfs_pbuf_freecnt);
287
288 kva = (vm_offset_t) bp->b_data;
289 pmap_qenter(kva, pages, npages);
290
291 iov.iov_base = (caddr_t) kva;
292 iov.iov_len = count;
293 uio.uio_iov = &iov;
294 uio.uio_iovcnt = 1;
295 uio.uio_offset = offset;
296 uio.uio_resid = count;
297 uio.uio_segflg = UIO_SYSSPACE;
298 uio.uio_rw = UIO_WRITE;
299 uio.uio_procp = p;
300
301 if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
302 iomode = NFSV3WRITE_UNSTABLE;
303 else
304 iomode = NFSV3WRITE_FILESYNC;
305
306 error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit);
307
308 pmap_qremove(kva, npages);
309 relpbuf(bp, &nfs_pbuf_freecnt);
310
311 if (!error) {
312 int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
313 for (i = 0; i < nwritten; i++) {
314 rtvals[i] = VM_PAGER_OK;
315 vm_page_undirty(pages[i]);
316 }
317 if (must_commit)
318 nfs_clearcommit(vp->v_mount);
319 }
320 return rtvals[0];
321}
322
323/*
324 * Vnode op for read using bio
325 */
326int
327nfs_bioread(vp, uio, ioflag, cred)
328 register struct vnode *vp;
329 register struct uio *uio;
330 int ioflag;
331 struct ucred *cred;
332{
333 register struct nfsnode *np = VTONFS(vp);
334 register int biosize, i;
335 struct buf *bp = 0, *rabp;
336 struct vattr vattr;
337 struct proc *p;
338 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
339 daddr_t lbn, rabn;
340 int bcount;
341 int seqcount;
342 int nra, error = 0, n = 0, on = 0;
343
344#ifdef DIAGNOSTIC
345 if (uio->uio_rw != UIO_READ)
346 panic("nfs_read mode");
347#endif
348 if (uio->uio_resid == 0)
349 return (0);
350 if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */
351 return (EINVAL);
352 p = uio->uio_procp;
353
354 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
355 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
356 (void)nfs_fsinfo(nmp, vp, cred, p);
357 if (vp->v_type != VDIR &&
358 (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
359 return (EFBIG);
360 biosize = vp->v_mount->mnt_stat.f_iosize;
361 seqcount = (int)((off_t)(ioflag >> 16) * biosize / BKVASIZE);
362 /*
363 * For nfs, cache consistency can only be maintained approximately.
364 * Although RFC1094 does not specify the criteria, the following is
365 * believed to be compatible with the reference port.
366 * For nqnfs, full cache consistency is maintained within the loop.
367 * For nfs:
368 * If the file's modify time on the server has changed since the
369 * last read rpc or you have written to the file,
370 * you may have lost data cache consistency with the
371 * server, so flush all of the file's data out of the cache.
372 * Then force a getattr rpc to ensure that you have up to date
373 * attributes.
374 * NB: This implies that cache data can be read when up to
375 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
376 * attributes this could be forced by setting n_attrstamp to 0 before
377 * the VOP_GETATTR() call.
378 */
379 if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
380 if (np->n_flag & NMODIFIED) {
381 if (vp->v_type != VREG) {
382 if (vp->v_type != VDIR)
383 panic("nfs: bioread, not dir");
384 nfs_invaldir(vp);
385 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
386 if (error)
387 return (error);
388 }
389 np->n_attrstamp = 0;
390 error = VOP_GETATTR(vp, &vattr, cred, p);
391 if (error)
392 return (error);
393 np->n_mtime = vattr.va_mtime.tv_sec;
394 } else {
395 error = VOP_GETATTR(vp, &vattr, cred, p);
396 if (error)
397 return (error);
398 if (np->n_mtime != vattr.va_mtime.tv_sec) {
399 if (vp->v_type == VDIR)
400 nfs_invaldir(vp);
401 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
402 if (error)
403 return (error);
404 np->n_mtime = vattr.va_mtime.tv_sec;
405 }
406 }
407 }
408 do {
409
410 /*
411 * Get a valid lease. If cached data is stale, flush it.
412 */
413 if (nmp->nm_flag & NFSMNT_NQNFS) {
414 if (NQNFS_CKINVALID(vp, np, ND_READ)) {
415 do {
416 error = nqnfs_getlease(vp, ND_READ, cred, p);
417 } while (error == NQNFS_EXPIRED);
418 if (error)
419 return (error);
420 if (np->n_lrev != np->n_brev ||
421 (np->n_flag & NQNFSNONCACHE) ||
422 ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
423 if (vp->v_type == VDIR)
424 nfs_invaldir(vp);
425 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
426 if (error)
427 return (error);
428 np->n_brev = np->n_lrev;
429 }
430 } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
431 nfs_invaldir(vp);
432 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
433 if (error)
434 return (error);
435 }
436 }
437 if (np->n_flag & NQNFSNONCACHE) {
438 switch (vp->v_type) {
439 case VREG:
440 return (nfs_readrpc(vp, uio, cred));
441 case VLNK:
442 return (nfs_readlinkrpc(vp, uio, cred));
443 case VDIR:
444 break;
445 default:
446 printf(" NQNFSNONCACHE: type %x unexpected\n",
447 vp->v_type);
448 };
449 }
450 switch (vp->v_type) {
451 case VREG:
452 nfsstats.biocache_reads++;
453 lbn = uio->uio_offset / biosize;
454 on = uio->uio_offset & (biosize - 1);
455
456 /*
457 * Start the read ahead(s), as required.
458 */
459 if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
460 for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
461 (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
462 rabn = lbn + 1 + nra;
463 if (!incore(vp, rabn)) {
464 rabp = nfs_getcacheblk(vp, rabn, biosize, p);
465 if (!rabp)
466 return (EINTR);
467 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
468 rabp->b_flags |= B_ASYNC;
469 rabp->b_iocmd = BIO_READ;
470 vfs_busy_pages(rabp, 0);
471 if (nfs_asyncio(rabp, cred, p)) {
472 rabp->b_flags |= B_INVAL;
473 rabp->b_ioflags |= BIO_ERROR;
474 vfs_unbusy_pages(rabp);
475 brelse(rabp);
476 break;
477 }
478 } else {
479 brelse(rabp);
480 }
481 }
482 }
483 }
484
485 /*
486 * Obtain the buffer cache block. Figure out the buffer size
487 * when we are at EOF. If we are modifying the size of the
488 * buffer based on an EOF condition we need to hold
489 * nfs_rslock() through obtaining the buffer to prevent
490 * a potential writer-appender from messing with n_size.
 491	 * Otherwise we may accidentally truncate the buffer and
492 * lose dirty data.
493 *
494 * Note that bcount is *not* DEV_BSIZE aligned.
495 */
496
497again:
498 bcount = biosize;
499 if ((off_t)lbn * biosize >= np->n_size) {
500 bcount = 0;
501 } else if ((off_t)(lbn + 1) * biosize > np->n_size) {
502 bcount = np->n_size - (off_t)lbn * biosize;
503 }
504 if (bcount != biosize) {
505 switch(nfs_rslock(np, p)) {
506 case ENOLCK:
507 goto again;
508 /* not reached */
509 case EINTR:
510 case ERESTART:
511 return(EINTR);
512 /* not reached */
513 default:
514 break;
515 }
516 }
517
518 bp = nfs_getcacheblk(vp, lbn, bcount, p);
519
520 if (bcount != biosize)
521 nfs_rsunlock(np, p);
522 if (!bp)
523 return (EINTR);
524
525 /*
526 * If B_CACHE is not set, we must issue the read. If this
527 * fails, we return an error.
528 */
529
530 if ((bp->b_flags & B_CACHE) == 0) {
531 bp->b_iocmd = BIO_READ;
532 vfs_busy_pages(bp, 0);
533 error = nfs_doio(bp, cred, p);
534 if (error) {
535 brelse(bp);
536 return (error);
537 }
538 }
539
540 /*
541 * on is the offset into the current bp. Figure out how many
542 * bytes we can copy out of the bp. Note that bcount is
543 * NOT DEV_BSIZE aligned.
544 *
545 * Then figure out how many bytes we can copy into the uio.
546 */
547
548 n = 0;
549 if (on < bcount)
550 n = min((unsigned)(bcount - on), uio->uio_resid);
551 break;
552 case VLNK:
553 nfsstats.biocache_readlinks++;
554 bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
555 if (!bp)
556 return (EINTR);
557 if ((bp->b_flags & B_CACHE) == 0) {
558 bp->b_iocmd = BIO_READ;
559 vfs_busy_pages(bp, 0);
560 error = nfs_doio(bp, cred, p);
561 if (error) {
562 bp->b_ioflags |= BIO_ERROR;
563 brelse(bp);
564 return (error);
565 }
566 }
567 n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
568 on = 0;
569 break;
570 case VDIR:
571 nfsstats.biocache_readdirs++;
572 if (np->n_direofoffset
573 && uio->uio_offset >= np->n_direofoffset) {
574 return (0);
575 }
576 lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
577 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
578 bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
579 if (!bp)
580 return (EINTR);
581 if ((bp->b_flags & B_CACHE) == 0) {
582 bp->b_iocmd = BIO_READ;
583 vfs_busy_pages(bp, 0);
584 error = nfs_doio(bp, cred, p);
585 if (error) {
586 brelse(bp);
587 }
588 while (error == NFSERR_BAD_COOKIE) {
589 printf("got bad cookie vp %p bp %p\n", vp, bp);
590 nfs_invaldir(vp);
591 error = nfs_vinvalbuf(vp, 0, cred, p, 1);
592 /*
593 * Yuck! The directory has been modified on the
594 * server. The only way to get the block is by
595 * reading from the beginning to get all the
596 * offset cookies.
597 *
598 * Leave the last bp intact unless there is an error.
599 * Loop back up to the while if the error is another
 600	 * NFSERR_BAD_COOKIE (double yuck!).
601 */
602 for (i = 0; i <= lbn && !error; i++) {
603 if (np->n_direofoffset
604 && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
605 return (0);
606 bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
607 if (!bp)
608 return (EINTR);
609 if ((bp->b_flags & B_CACHE) == 0) {
610 bp->b_iocmd = BIO_READ;
611 vfs_busy_pages(bp, 0);
612 error = nfs_doio(bp, cred, p);
613 /*
614 * no error + B_INVAL == directory EOF,
615 * use the block.
616 */
617 if (error == 0 && (bp->b_flags & B_INVAL))
618 break;
619 }
620 /*
621 * An error will throw away the block and the
622 * for loop will break out. If no error and this
623 * is not the block we want, we throw away the
624 * block and go for the next one via the for loop.
625 */
626 if (error || i < lbn)
627 brelse(bp);
628 }
629 }
630 /*
631 * The above while is repeated if we hit another cookie
632 * error. If we hit an error and it wasn't a cookie error,
633 * we give up.
634 */
635 if (error)
636 return (error);
637 }
638
639 /*
640 * If not eof and read aheads are enabled, start one.
641 * (You need the current block first, so that you have the
642 * directory offset cookie of the next block.)
643 */
644 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
645 (bp->b_flags & B_INVAL) == 0 &&
646 (np->n_direofoffset == 0 ||
647 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
648 !(np->n_flag & NQNFSNONCACHE) &&
649 !incore(vp, lbn + 1)) {
650 rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
651 if (rabp) {
652 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
653 rabp->b_flags |= B_ASYNC;
654 rabp->b_iocmd = BIO_READ;
655 vfs_busy_pages(rabp, 0);
656 if (nfs_asyncio(rabp, cred, p)) {
657 rabp->b_flags |= B_INVAL;
658 rabp->b_ioflags |= BIO_ERROR;
659 vfs_unbusy_pages(rabp);
660 brelse(rabp);
661 }
662 } else {
663 brelse(rabp);
664 }
665 }
666 }
667 /*
 668	 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
669 * chopped for the EOF condition, we cannot tell how large
670 * NFS directories are going to be until we hit EOF. So
671 * an NFS directory buffer is *not* chopped to its EOF. Now,
672 * it just so happens that b_resid will effectively chop it
673 * to EOF. *BUT* this information is lost if the buffer goes
674 * away and is reconstituted into a B_CACHE state ( due to
675 * being VMIO ) later. So we keep track of the directory eof
676 * in np->n_direofoffset and chop it off as an extra step
677 * right here.
678 */
679 n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
680 if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
681 n = np->n_direofoffset - uio->uio_offset;
682 break;
683 default:
684 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
685 break;
686 };
687
688 if (n > 0) {
689 error = uiomove(bp->b_data + on, (int)n, uio);
690 }
691 switch (vp->v_type) {
692 case VREG:
693 break;
694 case VLNK:
695 n = 0;
696 break;
697 case VDIR:
698 /*
699 * Invalidate buffer if caching is disabled, forcing a
700 * re-read from the remote later.
701 */
702 if (np->n_flag & NQNFSNONCACHE)
703 bp->b_flags |= B_INVAL;
704 break;
705 default:
706 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
707 }
708 brelse(bp);
709 } while (error == 0 && uio->uio_resid > 0 && n > 0);
710 return (error);
711}
712
713/*
714 * Vnode op for write using bio
715 */
716int
717nfs_write(ap)
718 struct vop_write_args /* {
719 struct vnode *a_vp;
720 struct uio *a_uio;
721 int a_ioflag;
722 struct ucred *a_cred;
723 } */ *ap;
724{
725 int biosize;
726 struct uio *uio = ap->a_uio;
727 struct proc *p = uio->uio_procp;
728 struct vnode *vp = ap->a_vp;
729 struct nfsnode *np = VTONFS(vp);
730 struct ucred *cred = ap->a_cred;
731 int ioflag = ap->a_ioflag;
732 struct buf *bp;
733 struct vattr vattr;
734 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
735 daddr_t lbn;
736 int bcount;
737 int n, on, error = 0, iomode, must_commit;
738 int haverslock = 0;
739
740#ifdef DIAGNOSTIC
741 if (uio->uio_rw != UIO_WRITE)
742 panic("nfs_write mode");
743 if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
744 panic("nfs_write proc");
745#endif
746 if (vp->v_type != VREG)
747 return (EIO);
748 if (np->n_flag & NWRITEERR) {
749 np->n_flag &= ~NWRITEERR;
750 return (np->n_error);
751 }
752 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
753 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
754 (void)nfs_fsinfo(nmp, vp, cred, p);
755
756 /*
757 * Synchronously flush pending buffers if we are in synchronous
758 * mode or if we are appending.
759 */
760 if (ioflag & (IO_APPEND | IO_SYNC)) {
761 if (np->n_flag & NMODIFIED) {
762 np->n_attrstamp = 0;
763 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
764 if (error)
765 return (error);
766 }
767 }
768
769 /*
770 * If IO_APPEND then load uio_offset. We restart here if we cannot
771 * get the append lock.
772 */
773restart:
774 if (ioflag & IO_APPEND) {
775 np->n_attrstamp = 0;
776 error = VOP_GETATTR(vp, &vattr, cred, p);
777 if (error)
778 return (error);
779 uio->uio_offset = np->n_size;
780 }
781
782 if (uio->uio_offset < 0)
783 return (EINVAL);
784 if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
785 return (EFBIG);
786 if (uio->uio_resid == 0)
787 return (0);
788
789 /*
790 * We need to obtain the rslock if we intend to modify np->n_size
 791	 * in order to guarantee the append point with multiple contending
 792	 * writers, to guarantee that no other appenders modify n_size
 793	 * while we are trying to obtain a truncated buffer (i.e. to avoid
 794	 * accidentally truncating data written by another appender due to
795 * the race), and to ensure that the buffer is populated prior to
796 * our extending of the file. We hold rslock through the entire
797 * operation.
798 *
799 * Note that we do not synchronize the case where someone truncates
800 * the file while we are appending to it because attempting to lock
801 * this case may deadlock other parts of the system unexpectedly.
802 */
803 if ((ioflag & IO_APPEND) ||
804 uio->uio_offset + uio->uio_resid > np->n_size) {
805 switch(nfs_rslock(np, p)) {
806 case ENOLCK:
807 goto restart;
808 /* not reached */
809 case EINTR:
810 case ERESTART:
811 return(EINTR);
812 /* not reached */
813 default:
814 break;
815 }
816 haverslock = 1;
817 }
818
819 /*
820 * Maybe this should be above the vnode op call, but so long as
 822	 * file servers have no limits, I don't think it matters
822 */
823 if (p && uio->uio_offset + uio->uio_resid >
824 p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
825 psignal(p, SIGXFSZ);
826 if (haverslock)
827 nfs_rsunlock(np, p);
828 return (EFBIG);
829 }
830
831 biosize = vp->v_mount->mnt_stat.f_iosize;
832
833 do {
834 /*
835 * Check for a valid write lease.
836 */
837 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
838 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
839 do {
840 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
841 } while (error == NQNFS_EXPIRED);
842 if (error)
843 break;
844 if (np->n_lrev != np->n_brev ||
845 (np->n_flag & NQNFSNONCACHE)) {
846 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
847 if (error)
848 break;
849 np->n_brev = np->n_lrev;
850 }
851 }
852 if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
853 iomode = NFSV3WRITE_FILESYNC;
854 error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
855 if (must_commit)
856 nfs_clearcommit(vp->v_mount);
857 break;
858 }
859 nfsstats.biocache_writes++;
860 lbn = uio->uio_offset / biosize;
861 on = uio->uio_offset & (biosize-1);
862 n = min((unsigned)(biosize - on), uio->uio_resid);
863again:
864 /*
865 * Handle direct append and file extension cases, calculate
866 * unaligned buffer size.
867 */
868
869 if (uio->uio_offset == np->n_size && n) {
870 /*
871 * Get the buffer (in its pre-append state to maintain
872 * B_CACHE if it was previously set). Resize the
873 * nfsnode after we have locked the buffer to prevent
874 * readers from reading garbage.
875 */
876 bcount = on;
877 bp = nfs_getcacheblk(vp, lbn, bcount, p);
878
879 if (bp != NULL) {
880 long save;
881
882 np->n_size = uio->uio_offset + n;
883 np->n_flag |= NMODIFIED;
884 vnode_pager_setsize(vp, np->n_size);
885
886 save = bp->b_flags & B_CACHE;
887 bcount += n;
888 allocbuf(bp, bcount);
889 bp->b_flags |= save;
890 }
891 } else {
892 /*
893 * Obtain the locked cache block first, and then
894 * adjust the file's size as appropriate.
895 */
896 bcount = on + n;
897 if ((off_t)lbn * biosize + bcount < np->n_size) {
898 if ((off_t)(lbn + 1) * biosize < np->n_size)
899 bcount = biosize;
900 else
901 bcount = np->n_size - (off_t)lbn * biosize;
902 }
903
904 bp = nfs_getcacheblk(vp, lbn, bcount, p);
905
906 if (uio->uio_offset + n > np->n_size) {
907 np->n_size = uio->uio_offset + n;
908 np->n_flag |= NMODIFIED;
909 vnode_pager_setsize(vp, np->n_size);
910 }
911 }
912
913 if (!bp) {
914 error = EINTR;
915 break;
916 }
917
918 /*
919 * Issue a READ if B_CACHE is not set. In special-append
920 * mode, B_CACHE is based on the buffer prior to the write
921 * op and is typically set, avoiding the read. If a read
922 * is required in special append mode, the server will
923 * probably send us a short-read since we extended the file
 924	 * on our end, resulting in b_resid == 0 and, thus,
925 * B_CACHE getting set.
926 *
927 * We can also avoid issuing the read if the write covers
928 * the entire buffer. We have to make sure the buffer state
929 * is reasonable in this case since we will not be initiating
930 * I/O. See the comments in kern/vfs_bio.c's getblk() for
931 * more information.
932 *
933 * B_CACHE may also be set due to the buffer being cached
934 * normally.
935 */
936
937 if (on == 0 && n == bcount) {
938 bp->b_flags |= B_CACHE;
939 bp->b_flags &= ~B_INVAL;
940 bp->b_ioflags &= ~BIO_ERROR;
941 }
942
943 if ((bp->b_flags & B_CACHE) == 0) {
944 bp->b_iocmd = BIO_READ;
945 vfs_busy_pages(bp, 0);
946 error = nfs_doio(bp, cred, p);
947 if (error) {
948 brelse(bp);
949 break;
950 }
951 }
952 if (!bp) {
953 error = EINTR;
954 break;
955 }
956 if (bp->b_wcred == NOCRED) {
957 crhold(cred);
958 bp->b_wcred = cred;
959 }
960 np->n_flag |= NMODIFIED;
961
962 /*
963 * If dirtyend exceeds file size, chop it down. This should
964 * not normally occur but there is an append race where it
965 * might occur XXX, so we log it.
966 *
967 * If the chopping creates a reverse-indexed or degenerate
968 * situation with dirtyoff/end, we 0 both of them.
969 */
970
971 if (bp->b_dirtyend > bcount) {
972 printf("NFS append race @%lx:%d\n",
973 (long)bp->b_blkno * DEV_BSIZE,
974 bp->b_dirtyend - bcount);
975 bp->b_dirtyend = bcount;
976 }
977
978 if (bp->b_dirtyoff >= bp->b_dirtyend)
979 bp->b_dirtyoff = bp->b_dirtyend = 0;
980
981 /*
982 * If the new write will leave a contiguous dirty
983 * area, just update the b_dirtyoff and b_dirtyend,
984 * otherwise force a write rpc of the old dirty area.
985 *
986 * While it is possible to merge discontiguous writes due to
987 * our having a B_CACHE buffer ( and thus valid read data
988 * for the hole), we don't because it could lead to
989 * significant cache coherency problems with multiple clients,
990 * especially if locking is implemented later on.
991 *
992 * as an optimization we could theoretically maintain
993 * a linked list of discontinuous areas, but we would still
994 * have to commit them separately so there isn't much
995 * advantage to it except perhaps a bit of asynchronization.
996 */
997
998 if (bp->b_dirtyend > 0 &&
999 (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
1000 if (BUF_WRITE(bp) == EINTR)
1001 return (EINTR);
1002 goto again;
1003 }
1004
1005 /*
1006 * Check for valid write lease and get one as required.
1007 * In case getblk() and/or bwrite() delayed us.
1008 */
1009 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
1010 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
1011 do {
1012 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
1013 } while (error == NQNFS_EXPIRED);
1014 if (error) {
1015 brelse(bp);
1016 break;
1017 }
1018 if (np->n_lrev != np->n_brev ||
1019 (np->n_flag & NQNFSNONCACHE)) {
1020 brelse(bp);
1021 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1022 if (error)
1023 break;
1024 np->n_brev = np->n_lrev;
1025 goto again;
1026 }
1027 }
1028
1029 error = uiomove((char *)bp->b_data + on, n, uio);
1030
1031 /*
1032 * Since this block is being modified, it must be written
1033 * again and not just committed. Since write clustering does
1034 * not work for the stage 1 data write, only the stage 2
1035 * commit rpc, we have to clear B_CLUSTEROK as well.
1036 */
1037 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1038
1039 if (error) {
1040 bp->b_ioflags |= BIO_ERROR;
1041 brelse(bp);
1042 break;
1043 }
1044
1045 /*
1046 * Only update dirtyoff/dirtyend if not a degenerate
1047 * condition.
1048 */
1049 if (n) {
1050 if (bp->b_dirtyend > 0) {
1051 bp->b_dirtyoff = min(on, bp->b_dirtyoff);
1052 bp->b_dirtyend = max((on + n), bp->b_dirtyend);
1053 } else {
1054 bp->b_dirtyoff = on;
1055 bp->b_dirtyend = on + n;
1056 }
1057 vfs_bio_set_validclean(bp, on, n);
1058 }
1059
1060 /*
1061 * If the lease is non-cachable or IO_SYNC do bwrite().
1062 *
1063 * IO_INVAL appears to be unused. The idea appears to be
1064 * to turn off caching in this case. Very odd. XXX
1065 */
1066 if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
1067 if (ioflag & IO_INVAL)
1068 bp->b_flags |= B_NOCACHE;
1069 error = BUF_WRITE(bp);
1070 if (error)
1071 break;
1072 if (np->n_flag & NQNFSNONCACHE) {
1073 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1074 if (error)
1075 break;
1076 }
1077 } else if ((n + on) == biosize &&
1078 (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
1079 bp->b_flags |= B_ASYNC;
1080 (void)nfs_writebp(bp, 0, 0);
1081 } else {
1082 bdwrite(bp);
1083 }
1084 } while (uio->uio_resid > 0 && n > 0);
1085
1086 if (haverslock)
1087 nfs_rsunlock(np, p);
1088
1089 return (error);
1090}
1091
1092/*
1093 * Get an nfs cache block.
1094 *
1095 * Allocate a new one if the block isn't currently in the cache
1096 * and return the block marked busy. If the calling process is
1097 * interrupted by a signal for an interruptible mount point, return
1098 * NULL.
1099 *
1100 * The caller must carefully deal with the possible B_INVAL state of
1101 * the buffer. nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
1102 * indirectly), so synchronous reads can be issued without worrying about
1103 * the B_INVAL state. We have to be a little more careful when dealing
1104 * with writes (see comments in nfs_write()) when extending a file past
1105 * its EOF.
1106 */
1107static struct buf *
1108nfs_getcacheblk(vp, bn, size, p)
1109 struct vnode *vp;
1110 daddr_t bn;
1111 int size;
1112 struct proc *p;
1113{
1114 register struct buf *bp;
1115 struct mount *mp;
1116 struct nfsmount *nmp;
1117
1118 mp = vp->v_mount;
1119 nmp = VFSTONFS(mp);
1120
1121 if (nmp->nm_flag & NFSMNT_INT) {
1122 bp = getblk(vp, bn, size, PCATCH, 0);
1123 while (bp == (struct buf *)0) {
1124 if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
1125 return ((struct buf *)0);
1126 bp = getblk(vp, bn, size, 0, 2 * hz);
1127 }
1128 } else {
1129 bp = getblk(vp, bn, size, 0, 0);
1130 }
1131
1132 if (vp->v_type == VREG) {
1133 int biosize;
1134
1135 biosize = mp->mnt_stat.f_iosize;
1136 bp->b_blkno = bn * (biosize / DEV_BSIZE);
1137 }
1138 return (bp);
1139}
1140
1141/*
1142 * Flush and invalidate all dirty buffers. If another process is already
1143 * doing the flush, just wait for completion.
1144 */
1145int
1146nfs_vinvalbuf(vp, flags, cred, p, intrflg)
1147 struct vnode *vp;
1148 int flags;
1149 struct ucred *cred;
1150 struct proc *p;
1151 int intrflg;
1152{
1153 register struct nfsnode *np = VTONFS(vp);
1154 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1155 int error = 0, slpflag, slptimeo;
1156
1157 if (vp->v_flag & VXLOCK) {
1158 return (0);
1159 }
1160
1161 if ((nmp->nm_flag & NFSMNT_INT) == 0)
1162 intrflg = 0;
1163 if (intrflg) {
1164 slpflag = PCATCH;
1165 slptimeo = 2 * hz;
1166 } else {
1167 slpflag = 0;
1168 slptimeo = 0;
1169 }
1170 /*
1171 * First wait for any other process doing a flush to complete.
1172 */
1173 while (np->n_flag & NFLUSHINPROG) {
1174 np->n_flag |= NFLUSHWANT;
1175 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
1176 slptimeo);
1177 if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
1178 return (EINTR);
1179 }
1180
1181 /*
1182 * Now, flush as required.
1183 */
1184 np->n_flag |= NFLUSHINPROG;
1185 error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
1186 while (error) {
1187 if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
1188 np->n_flag &= ~NFLUSHINPROG;
1189 if (np->n_flag & NFLUSHWANT) {
1190 np->n_flag &= ~NFLUSHWANT;
1191 wakeup((caddr_t)&np->n_flag);
1192 }
1193 return (EINTR);
1194 }
1195 error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
1196 }
1197 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
1198 if (np->n_flag & NFLUSHWANT) {
1199 np->n_flag &= ~NFLUSHWANT;
1200 wakeup((caddr_t)&np->n_flag);
1201 }
1202 return (0);
1203}
1204
1205/*
1206 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
1207 * This is mainly to avoid queueing async I/O requests when the nfsiods
1208 * are all hung on a dead server.
1209 *
1210 * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
1211 * is eventually dequeued by the async daemon, nfs_doio() *will*.
1212 */
1213int
1214nfs_asyncio(bp, cred, procp)
1215 register struct buf *bp;
1216 struct ucred *cred;
1217 struct proc *procp;
1218{
1219 struct nfsmount *nmp;
1220 int i;
1221 int gotiod;
1222 int slpflag = 0;
1223 int slptimeo = 0;
1224 int error;
1225
1226 /*
1227 * If no async daemons then return EIO to force caller to run the rpc
1228 * synchronously.
1229 */
1230 if (nfs_numasync == 0)
1231 return (EIO);
1232
1233 nmp = VFSTONFS(bp->b_vp->v_mount);
1234
1235 /*
 1236	 * Commits are usually short and sweet so let's save some CPU and
1237 * leave the async daemons for more important rpc's (such as reads
1238 * and writes).
1239 */
1240 if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
1241 (nmp->nm_bufqiods > nfs_numasync / 2)) {
1242 return(EIO);
1243 }
1244
1245again:
1246 if (nmp->nm_flag & NFSMNT_INT)
1247 slpflag = PCATCH;
1248 gotiod = FALSE;
1249
1250 /*
1251 * Find a free iod to process this request.
1252 */
1253 for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
1254 if (nfs_iodwant[i]) {
1255 /*
1256 * Found one, so wake it up and tell it which
1257 * mount to process.
1258 */
1259 NFS_DPF(ASYNCIO,
1260 ("nfs_asyncio: waking iod %d for mount %p\n",
1261 i, nmp));
1262 nfs_iodwant[i] = (struct proc *)0;
1263 nfs_iodmount[i] = nmp;
1264 nmp->nm_bufqiods++;
1265 wakeup((caddr_t)&nfs_iodwant[i]);
1266 gotiod = TRUE;
1267 break;
1268 }
1269
1270 /*
1271 * If none are free, we may already have an iod working on this mount
1272 * point. If so, it will process our request.
1273 */
1274 if (!gotiod) {
1275 if (nmp->nm_bufqiods > 0) {
1276 NFS_DPF(ASYNCIO,
1277 ("nfs_asyncio: %d iods are already processing mount %p\n",
1278 nmp->nm_bufqiods, nmp));
1279 gotiod = TRUE;
1280 }
1281 }
1282
1283 /*
1284 * If we have an iod which can process the request, then queue
1285 * the buffer.
1286 */
1287 if (gotiod) {
1288 /*
1289 * Ensure that the queue never grows too large. We still want
 1290	 * to asynchronize so we block rather than return EIO.
1291 */
1292 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
1293 NFS_DPF(ASYNCIO,
1294 ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
1295 nmp->nm_bufqwant = TRUE;
1296 error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
1297 "nfsaio", slptimeo);
1298 if (error) {
1299 if (nfs_sigintr(nmp, NULL, procp))
1300 return (EINTR);
1301 if (slpflag == PCATCH) {
1302 slpflag = 0;
1303 slptimeo = 2 * hz;
1304 }
1305 }
1306 /*
1307 * We might have lost our iod while sleeping,
 1308	 * so check and loop if necessary.
1309 */
1310 if (nmp->nm_bufqiods == 0) {
1311 NFS_DPF(ASYNCIO,
1312 ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
1313 goto again;
1314 }
1315 }
1316
1317 if (bp->b_iocmd == BIO_READ) {
1318 if (bp->b_rcred == NOCRED && cred != NOCRED) {
1319 crhold(cred);
1320 bp->b_rcred = cred;
1321 }
1322 } else {
1323 bp->b_flags |= B_WRITEINPROG;
1324 if (bp->b_wcred == NOCRED && cred != NOCRED) {
1325 crhold(cred);
1326 bp->b_wcred = cred;
1327 }
1328 }
1329
1330 BUF_KERNPROC(bp);
1331 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
1332 nmp->nm_bufqlen++;
1333 return (0);
1334 }
1335
1336 /*
1337 * All the iods are busy on other mounts, so return EIO to
1338 * force the caller to process the i/o synchronously.
1339 */
1340 NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
1341 return (EIO);
1342}
1343
1344/*
1345 * Do an I/O operation to/from a cache block. This may be called
1346 * synchronously or from an nfsiod.
1347 */
1348int
1349nfs_doio(bp, cr, p)
1350 struct buf *bp;
1351 struct ucred *cr;
1352 struct proc *p;
1353{
1354 struct uio *uiop;
1355 struct vnode *vp;
1356 struct nfsnode *np;
1357 struct nfsmount *nmp;
1358 int error = 0, iomode, must_commit = 0;
1359 struct uio uio;
1360 struct iovec io;
1361
1362 vp = bp->b_vp;
1363 np = VTONFS(vp);
1364 nmp = VFSTONFS(vp->v_mount);
1365 uiop = &uio;
1366 uiop->uio_iov = &io;
1367 uiop->uio_iovcnt = 1;
1368 uiop->uio_segflg = UIO_SYSSPACE;
1369 uiop->uio_procp = p;
1370
1371 /*
1372 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We
1373 * do this here so we do not have to do it in all the code that
1374 * calls us.
1375 */
1376 bp->b_flags &= ~B_INVAL;
1377 bp->b_ioflags &= ~BIO_ERROR;
1378
1379 KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
1380
1381 /*
1382 * Historically, paging was done with physio, but no more.
1383 */
1384 if (bp->b_flags & B_PHYS) {
1385 /*
1386 * ...though reading /dev/drum still gets us here.
1387 */
1388 io.iov_len = uiop->uio_resid = bp->b_bcount;
1389 /* mapping was done by vmapbuf() */
1390 io.iov_base = bp->b_data;
1391 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
1392 if (bp->b_iocmd == BIO_READ) {
1393 uiop->uio_rw = UIO_READ;
1394 nfsstats.read_physios++;
1395 error = nfs_readrpc(vp, uiop, cr);
1396 } else {
1397 int com;
1398
1399 iomode = NFSV3WRITE_DATASYNC;
1400 uiop->uio_rw = UIO_WRITE;
1401 nfsstats.write_physios++;
1402 error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
1403 }
1404 if (error) {
1405 bp->b_ioflags |= BIO_ERROR;
1406 bp->b_error = error;
1407 }
1408 } else if (bp->b_iocmd == BIO_READ) {
1409 io.iov_len = uiop->uio_resid = bp->b_bcount;
1410 io.iov_base = bp->b_data;
1411 uiop->uio_rw = UIO_READ;
1412 switch (vp->v_type) {
1413 case VREG:
1414 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
1415 nfsstats.read_bios++;
1416 error = nfs_readrpc(vp, uiop, cr);
1417 if (!error) {
1418 if (uiop->uio_resid) {
1419 /*
1420 * If we had a short read with no error, we must have
1421 * hit a file hole. We should zero-fill the remainder.
1422 * This can also occur if the server hits the file EOF.
1423 *
1424 * Holes used to be able to occur due to pending
1425 * writes, but that is not possible any longer.
1426 */
1427 int nread = bp->b_bcount - uiop->uio_resid;
1428 int left = bp->b_bcount - nread;
1429
1430 if (left > 0)
1431 bzero((char *)bp->b_data + nread, left);
1432 uiop->uio_resid = 0;
1433 }
1434 }
1435 if (p && (vp->v_flag & VTEXT) &&
1436 (((nmp->nm_flag & NFSMNT_NQNFS) &&
1437 NQNFS_CKINVALID(vp, np, ND_READ) &&
1438 np->n_lrev != np->n_brev) ||
1439 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
1440 np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
1441 uprintf("Process killed due to text file modification\n");
1442 psignal(p, SIGKILL);
1443 PHOLD(p);
1444 }
1445 break;
1446 case VLNK:
1447 uiop->uio_offset = (off_t)0;
1448 nfsstats.readlink_bios++;
1449 error = nfs_readlinkrpc(vp, uiop, cr);
1450 break;
1451 case VDIR:
1452 nfsstats.readdir_bios++;
1453 uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
1454 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
1455 error = nfs_readdirplusrpc(vp, uiop, cr);
1456 if (error == NFSERR_NOTSUPP)
1457 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
1458 }
1459 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
1460 error = nfs_readdirrpc(vp, uiop, cr);
1461 /*
1462 * end-of-directory sets B_INVAL but does not generate an
1463 * error.
1464 */
1465 if (error == 0 && uiop->uio_resid == bp->b_bcount)
1466 bp->b_flags |= B_INVAL;
1467 break;
1468 default:
1469 printf("nfs_doio: type %x unexpected\n",vp->v_type);
1470 break;
1471 };
1472 if (error) {
1473 bp->b_ioflags |= BIO_ERROR;
1474 bp->b_error = error;
1475 }
1476 } else {
1477 /*
1478 * If we only need to commit, try to commit
1479 */
1480 if (bp->b_flags & B_NEEDCOMMIT) {
1481 int retv;
1482 off_t off;
1483
1484 off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
1485 bp->b_flags |= B_WRITEINPROG;
1486 retv = nfs_commit(
1487 bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff,
1488 bp->b_wcred, p);
1489 bp->b_flags &= ~B_WRITEINPROG;
1490 if (retv == 0) {
1491 bp->b_dirtyoff = bp->b_dirtyend = 0;
1492 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1493 bp->b_resid = 0;
1494 bufdone(bp);
1495 return (0);
1496 }
1497 if (retv == NFSERR_STALEWRITEVERF) {
1498 nfs_clearcommit(bp->b_vp->v_mount);
1499 }
1500 }
1501
1502 /*
1503 * Setup for actual write
1504 */
1505
1506 if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
1507 bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
1508
1509 if (bp->b_dirtyend > bp->b_dirtyoff) {
1510 io.iov_len = uiop->uio_resid = bp->b_dirtyend
1511 - bp->b_dirtyoff;
1512 uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
1513 + bp->b_dirtyoff;
1514 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
1515 uiop->uio_rw = UIO_WRITE;
1516 nfsstats.write_bios++;
1517
1518 if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
1519 iomode = NFSV3WRITE_UNSTABLE;
1520 else
1521 iomode = NFSV3WRITE_FILESYNC;
1522
1523 bp->b_flags |= B_WRITEINPROG;
1524 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
1525
1526 /*
1527 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
1528 * to cluster the buffers needing commit. This will allow
1529 * the system to submit a single commit rpc for the whole
1530 * cluster. We can do this even if the buffer is not 100%
1531 * dirty (relative to the NFS blocksize), so we optimize the
1532 * append-to-file-case.
1533 *
1534 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
1535 * cleared because write clustering only works for commit
1536 * rpc's, not for the data portion of the write).
1537 */
1538
1539 if (!error && iomode == NFSV3WRITE_UNSTABLE) {
1540 bp->b_flags |= B_NEEDCOMMIT;
1541 if (bp->b_dirtyoff == 0
1542 && bp->b_dirtyend == bp->b_bcount)
1543 bp->b_flags |= B_CLUSTEROK;
1544 } else {
1545 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1546 }
1547 bp->b_flags &= ~B_WRITEINPROG;
1548
1549 /*
1550 * For an interrupted write, the buffer is still valid
1551 * and the write hasn't been pushed to the server yet,
1552 * so we can't set BIO_ERROR and report the interruption
1553 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
1554 * is not relevant, so the rpc attempt is essentially
1555 * a noop. For the case of a V3 write rpc not being
1556 * committed to stable storage, the block is still
1557 * dirty and requires either a commit rpc or another
1558 * write rpc with iomode == NFSV3WRITE_FILESYNC before
1559 * the block is reused. This is indicated by setting
1560 * the B_DELWRI and B_NEEDCOMMIT flags.
1561 *
1562 * If the buffer is marked B_PAGING, it does not reside on
1563 * the vp's paging queues so we cannot call bdirty(). The
1564 * bp in this case is not an NFS cache block so we should
1565 * be safe. XXX
1566 */
1567 if (error == EINTR
1568 || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
1569 int s;
1570
1571 s = splbio();
1572 bp->b_flags &= ~(B_INVAL|B_NOCACHE);
1573 if ((bp->b_flags & B_PAGING) == 0) {
1574 bdirty(bp);
1575 bp->b_flags &= ~B_DONE;
1576 }
1577 if (error && (bp->b_flags & B_ASYNC) == 0)
1578 bp->b_flags |= B_EINTR;
1579 splx(s);
1580 } else {
1581 if (error) {
1582 bp->b_ioflags |= BIO_ERROR;
1583 bp->b_error = np->n_error = error;
1584 np->n_flag |= NWRITEERR;
1585 }
1586 bp->b_dirtyoff = bp->b_dirtyend = 0;
1587 }
1588 } else {
1589 bp->b_resid = 0;
1590 bufdone(bp);
1591 return (0);
1592 }
1593 }
1594 bp->b_resid = uiop->uio_resid;
1595 if (must_commit)
1596 nfs_clearcommit(vp->v_mount);
1597 bufdone(bp);
1598 return (error);
1599}
47#include <sys/buf.h>
48#include <sys/vnode.h>
49#include <sys/mount.h>
50#include <sys/kernel.h>
51
52#include <vm/vm.h>
53#include <vm/vm_extern.h>
54#include <vm/vm_page.h>
55#include <vm/vm_object.h>
56#include <vm/vm_pager.h>
57#include <vm/vnode_pager.h>
58
59#include <nfs/rpcv2.h>
60#include <nfs/nfsproto.h>
61#include <nfs/nfs.h>
62#include <nfs/nfsmount.h>
63#include <nfs/nqnfs.h>
64#include <nfs/nfsnode.h>
65
66static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
67 struct proc *p));
68
69extern int nfs_numasync;
70extern int nfs_pbuf_freecnt;
71extern struct nfsstats nfsstats;
72
73/*
74 * Vnode op for VM getpages.
75 */
76int
77nfs_getpages(ap)
78 struct vop_getpages_args /* {
79 struct vnode *a_vp;
80 vm_page_t *a_m;
81 int a_count;
82 int a_reqpage;
83 vm_ooffset_t a_offset;
84 } */ *ap;
85{
86 int i, error, nextoff, size, toff, count, npages;
87 struct uio uio;
88 struct iovec iov;
89 vm_offset_t kva;
90 struct buf *bp;
91 struct vnode *vp;
92 struct proc *p;
93 struct ucred *cred;
94 struct nfsmount *nmp;
95 vm_page_t *pages;
96
97 vp = ap->a_vp;
98 p = curproc; /* XXX */
99 cred = curproc->p_ucred; /* XXX */
100 nmp = VFSTONFS(vp->v_mount);
101 pages = ap->a_m;
102 count = ap->a_count;
103
104 if (vp->v_object == NULL) {
105 printf("nfs_getpages: called with non-merged cache vnode??\n");
106 return VM_PAGER_ERROR;
107 }
108
109 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
110 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
111 (void)nfs_fsinfo(nmp, vp, cred, p);
112
113 npages = btoc(count);
114
115 /*
116 * If the requested page is partially valid, just return it and
117 * allow the pager to zero-out the blanks. Partially valid pages
118 * can only occur at the file EOF.
119 */
120
121 {
122 vm_page_t m = pages[ap->a_reqpage];
123
124 if (m->valid != 0) {
125 /* handled by vm_fault now */
126 /* vm_page_zero_invalid(m, TRUE); */
127 for (i = 0; i < npages; ++i) {
128 if (i != ap->a_reqpage)
129 vnode_pager_freepage(pages[i]);
130 }
131 return(0);
132 }
133 }
134
135 /*
136 * We use only the kva address for the buffer, but this is extremely
137 * convienient and fast.
138 */
139 bp = getpbuf(&nfs_pbuf_freecnt);
140
141 kva = (vm_offset_t) bp->b_data;
142 pmap_qenter(kva, pages, npages);
143
144 iov.iov_base = (caddr_t) kva;
145 iov.iov_len = count;
146 uio.uio_iov = &iov;
147 uio.uio_iovcnt = 1;
148 uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
149 uio.uio_resid = count;
150 uio.uio_segflg = UIO_SYSSPACE;
151 uio.uio_rw = UIO_READ;
152 uio.uio_procp = p;
153
154 error = nfs_readrpc(vp, &uio, cred);
155 pmap_qremove(kva, npages);
156
157 relpbuf(bp, &nfs_pbuf_freecnt);
158
159 if (error && (uio.uio_resid == count)) {
160 printf("nfs_getpages: error %d\n", error);
161 for (i = 0; i < npages; ++i) {
162 if (i != ap->a_reqpage)
163 vnode_pager_freepage(pages[i]);
164 }
165 return VM_PAGER_ERROR;
166 }
167
168 /*
169 * Calculate the number of bytes read and validate only that number
170 * of bytes. Note that due to pending writes, size may be 0. This
171 * does not mean that the remaining data is invalid!
172 */
173
174 size = count - uio.uio_resid;
175
176 for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
177 vm_page_t m;
178 nextoff = toff + PAGE_SIZE;
179 m = pages[i];
180
181 m->flags &= ~PG_ZERO;
182
183 if (nextoff <= size) {
184 /*
185 * Read operation filled an entire page
186 */
187 m->valid = VM_PAGE_BITS_ALL;
188 vm_page_undirty(m);
189 } else if (size > toff) {
190 /*
191 * Read operation filled a partial page.
192 */
193 m->valid = 0;
194 vm_page_set_validclean(m, 0, size - toff);
195 /* handled by vm_fault now */
196 /* vm_page_zero_invalid(m, TRUE); */
197 }
198
199 if (i != ap->a_reqpage) {
200 /*
201 * Whether or not to leave the page activated is up in
202 * the air, but we should put the page on a page queue
203 * somewhere (it already is in the object). Result:
204 * It appears that emperical results show that
205 * deactivating pages is best.
206 */
207
208 /*
209 * Just in case someone was asking for this page we
210 * now tell them that it is ok to use.
211 */
212 if (!error) {
213 if (m->flags & PG_WANTED)
214 vm_page_activate(m);
215 else
216 vm_page_deactivate(m);
217 vm_page_wakeup(m);
218 } else {
219 vnode_pager_freepage(m);
220 }
221 }
222 }
223 return 0;
224}
225
226/*
227 * Vnode op for VM putpages.
228 */
229int
230nfs_putpages(ap)
231 struct vop_putpages_args /* {
232 struct vnode *a_vp;
233 vm_page_t *a_m;
234 int a_count;
235 int a_sync;
236 int *a_rtvals;
237 vm_ooffset_t a_offset;
238 } */ *ap;
239{
240 struct uio uio;
241 struct iovec iov;
242 vm_offset_t kva;
243 struct buf *bp;
244 int iomode, must_commit, i, error, npages, count;
245 off_t offset;
246 int *rtvals;
247 struct vnode *vp;
248 struct proc *p;
249 struct ucred *cred;
250 struct nfsmount *nmp;
251 struct nfsnode *np;
252 vm_page_t *pages;
253
254 vp = ap->a_vp;
255 np = VTONFS(vp);
256 p = curproc; /* XXX */
257 cred = curproc->p_ucred; /* XXX */
258 nmp = VFSTONFS(vp->v_mount);
259 pages = ap->a_m;
260 count = ap->a_count;
261 rtvals = ap->a_rtvals;
262 npages = btoc(count);
263 offset = IDX_TO_OFF(pages[0]->pindex);
264
265 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
266 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
267 (void)nfs_fsinfo(nmp, vp, cred, p);
268
269 for (i = 0; i < npages; i++) {
270 rtvals[i] = VM_PAGER_AGAIN;
271 }
272
273 /*
274 * When putting pages, do not extend file past EOF.
275 */
276
277 if (offset + count > np->n_size) {
278 count = np->n_size - offset;
279 if (count < 0)
280 count = 0;
281 }
282
283 /*
284 * We use only the kva address for the buffer, but this is extremely
285 * convienient and fast.
286 */
287 bp = getpbuf(&nfs_pbuf_freecnt);
288
289 kva = (vm_offset_t) bp->b_data;
290 pmap_qenter(kva, pages, npages);
291
292 iov.iov_base = (caddr_t) kva;
293 iov.iov_len = count;
294 uio.uio_iov = &iov;
295 uio.uio_iovcnt = 1;
296 uio.uio_offset = offset;
297 uio.uio_resid = count;
298 uio.uio_segflg = UIO_SYSSPACE;
299 uio.uio_rw = UIO_WRITE;
300 uio.uio_procp = p;
301
302 if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
303 iomode = NFSV3WRITE_UNSTABLE;
304 else
305 iomode = NFSV3WRITE_FILESYNC;
306
307 error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit);
308
309 pmap_qremove(kva, npages);
310 relpbuf(bp, &nfs_pbuf_freecnt);
311
312 if (!error) {
313 int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
314 for (i = 0; i < nwritten; i++) {
315 rtvals[i] = VM_PAGER_OK;
316 vm_page_undirty(pages[i]);
317 }
318 if (must_commit)
319 nfs_clearcommit(vp->v_mount);
320 }
321 return rtvals[0];
322}
323
324/*
325 * Vnode op for read using bio
326 */
327int
328nfs_bioread(vp, uio, ioflag, cred)
329 register struct vnode *vp;
330 register struct uio *uio;
331 int ioflag;
332 struct ucred *cred;
333{
334 register struct nfsnode *np = VTONFS(vp);
335 register int biosize, i;
336 struct buf *bp = 0, *rabp;
337 struct vattr vattr;
338 struct proc *p;
339 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
340 daddr_t lbn, rabn;
341 int bcount;
342 int seqcount;
343 int nra, error = 0, n = 0, on = 0;
344
345#ifdef DIAGNOSTIC
346 if (uio->uio_rw != UIO_READ)
347 panic("nfs_read mode");
348#endif
349 if (uio->uio_resid == 0)
350 return (0);
351 if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */
352 return (EINVAL);
353 p = uio->uio_procp;
354
355 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
356 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
357 (void)nfs_fsinfo(nmp, vp, cred, p);
358 if (vp->v_type != VDIR &&
359 (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
360 return (EFBIG);
361 biosize = vp->v_mount->mnt_stat.f_iosize;
362 seqcount = (int)((off_t)(ioflag >> 16) * biosize / BKVASIZE);
363 /*
364 * For nfs, cache consistency can only be maintained approximately.
365 * Although RFC1094 does not specify the criteria, the following is
366 * believed to be compatible with the reference port.
367 * For nqnfs, full cache consistency is maintained within the loop.
368 * For nfs:
369 * If the file's modify time on the server has changed since the
370 * last read rpc or you have written to the file,
371 * you may have lost data cache consistency with the
372 * server, so flush all of the file's data out of the cache.
373 * Then force a getattr rpc to ensure that you have up to date
374 * attributes.
375 * NB: This implies that cache data can be read when up to
376 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
377 * attributes this could be forced by setting n_attrstamp to 0 before
378 * the VOP_GETATTR() call.
379 */
380 if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
381 if (np->n_flag & NMODIFIED) {
382 if (vp->v_type != VREG) {
383 if (vp->v_type != VDIR)
384 panic("nfs: bioread, not dir");
385 nfs_invaldir(vp);
386 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
387 if (error)
388 return (error);
389 }
390 np->n_attrstamp = 0;
391 error = VOP_GETATTR(vp, &vattr, cred, p);
392 if (error)
393 return (error);
394 np->n_mtime = vattr.va_mtime.tv_sec;
395 } else {
396 error = VOP_GETATTR(vp, &vattr, cred, p);
397 if (error)
398 return (error);
399 if (np->n_mtime != vattr.va_mtime.tv_sec) {
400 if (vp->v_type == VDIR)
401 nfs_invaldir(vp);
402 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
403 if (error)
404 return (error);
405 np->n_mtime = vattr.va_mtime.tv_sec;
406 }
407 }
408 }
409 do {
410
411 /*
412 * Get a valid lease. If cached data is stale, flush it.
413 */
414 if (nmp->nm_flag & NFSMNT_NQNFS) {
415 if (NQNFS_CKINVALID(vp, np, ND_READ)) {
416 do {
417 error = nqnfs_getlease(vp, ND_READ, cred, p);
418 } while (error == NQNFS_EXPIRED);
419 if (error)
420 return (error);
421 if (np->n_lrev != np->n_brev ||
422 (np->n_flag & NQNFSNONCACHE) ||
423 ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
424 if (vp->v_type == VDIR)
425 nfs_invaldir(vp);
426 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
427 if (error)
428 return (error);
429 np->n_brev = np->n_lrev;
430 }
431 } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
432 nfs_invaldir(vp);
433 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
434 if (error)
435 return (error);
436 }
437 }
438 if (np->n_flag & NQNFSNONCACHE) {
439 switch (vp->v_type) {
440 case VREG:
441 return (nfs_readrpc(vp, uio, cred));
442 case VLNK:
443 return (nfs_readlinkrpc(vp, uio, cred));
444 case VDIR:
445 break;
446 default:
447 printf(" NQNFSNONCACHE: type %x unexpected\n",
448 vp->v_type);
449 };
450 }
451 switch (vp->v_type) {
452 case VREG:
453 nfsstats.biocache_reads++;
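		/*
		 * lbn is the logical NFS block containing the current
		 * offset, on the byte offset within that block.
		 */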
454 lbn = uio->uio_offset / biosize;
455 on = uio->uio_offset & (biosize - 1);
456
457 /*
458 * Start the read ahead(s), as required.
459 */
460 if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
461 for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
462 (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
463 rabn = lbn + 1 + nra;
464 if (!incore(vp, rabn)) {
465 rabp = nfs_getcacheblk(vp, rabn, biosize, p);
466 if (!rabp)
467 return (EINTR);
468 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
469 rabp->b_flags |= B_ASYNC;
470 rabp->b_iocmd = BIO_READ;
471 vfs_busy_pages(rabp, 0);
472 if (nfs_asyncio(rabp, cred, p)) {
473 rabp->b_flags |= B_INVAL;
474 rabp->b_ioflags |= BIO_ERROR;
475 vfs_unbusy_pages(rabp);
476 brelse(rabp);
477 break;
478 }
479 } else {
480 brelse(rabp);
481 }
482 }
483 }
484 }
485
486 /*
487 * Obtain the buffer cache block. Figure out the buffer size
488 * when we are at EOF. If we are modifying the size of the
489 * buffer based on an EOF condition we need to hold
490 * nfs_rslock() through obtaining the buffer to prevent
491 * a potential writer-appender from messing with n_size.
492	 * Otherwise we may accidentally truncate the buffer and
493 * lose dirty data.
494 *
495 * Note that bcount is *not* DEV_BSIZE aligned.
496 */
497
498again:
499 bcount = biosize;
500 if ((off_t)lbn * biosize >= np->n_size) {
501 bcount = 0;
502 } else if ((off_t)(lbn + 1) * biosize > np->n_size) {
503 bcount = np->n_size - (off_t)lbn * biosize;
504 }
505 if (bcount != biosize) {
506 switch(nfs_rslock(np, p)) {
507 case ENOLCK:
508 goto again;
509 /* not reached */
510 case EINTR:
511 case ERESTART:
512 return(EINTR);
513 /* not reached */
514 default:
515 break;
516 }
517 }
518
519 bp = nfs_getcacheblk(vp, lbn, bcount, p);
520
521 if (bcount != biosize)
522 nfs_rsunlock(np, p);
523 if (!bp)
524 return (EINTR);
525
526 /*
527 * If B_CACHE is not set, we must issue the read. If this
528 * fails, we return an error.
529 */
530
531 if ((bp->b_flags & B_CACHE) == 0) {
532 bp->b_iocmd = BIO_READ;
533 vfs_busy_pages(bp, 0);
534 error = nfs_doio(bp, cred, p);
535 if (error) {
536 brelse(bp);
537 return (error);
538 }
539 }
540
541 /*
542 * on is the offset into the current bp. Figure out how many
543 * bytes we can copy out of the bp. Note that bcount is
544 * NOT DEV_BSIZE aligned.
545 *
546 * Then figure out how many bytes we can copy into the uio.
547 */
548
549 n = 0;
550 if (on < bcount)
551 n = min((unsigned)(bcount - on), uio->uio_resid);
552 break;
553 case VLNK:
554 nfsstats.biocache_readlinks++;
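		/*
		 * Symlinks are cached in a single NFS_MAXPATHLEN-sized
		 * buffer at logical block 0; after the read, b_resid tells
		 * us how much of that buffer is unused.
		 */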
555 bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
556 if (!bp)
557 return (EINTR);
558 if ((bp->b_flags & B_CACHE) == 0) {
559 bp->b_iocmd = BIO_READ;
560 vfs_busy_pages(bp, 0);
561 error = nfs_doio(bp, cred, p);
562 if (error) {
563 bp->b_ioflags |= BIO_ERROR;
564 brelse(bp);
565 return (error);
566 }
567 }
568 n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
569 on = 0;
570 break;
571 case VDIR:
572 nfsstats.biocache_readdirs++;
573 if (np->n_direofoffset
574 && uio->uio_offset >= np->n_direofoffset) {
575 return (0);
576 }
577 lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
578 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
579 bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
580 if (!bp)
581 return (EINTR);
582 if ((bp->b_flags & B_CACHE) == 0) {
583 bp->b_iocmd = BIO_READ;
584 vfs_busy_pages(bp, 0);
585 error = nfs_doio(bp, cred, p);
586 if (error) {
587 brelse(bp);
588 }
589 while (error == NFSERR_BAD_COOKIE) {
590 printf("got bad cookie vp %p bp %p\n", vp, bp);
591 nfs_invaldir(vp);
592 error = nfs_vinvalbuf(vp, 0, cred, p, 1);
593 /*
594 * Yuck! The directory has been modified on the
595 * server. The only way to get the block is by
596 * reading from the beginning to get all the
597 * offset cookies.
598 *
599 * Leave the last bp intact unless there is an error.
600 * Loop back up to the while if the error is another
601				 * NFSERR_BAD_COOKIE (double yuck!).
602 */
603 for (i = 0; i <= lbn && !error; i++) {
604 if (np->n_direofoffset
605 && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
606 return (0);
607 bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
608 if (!bp)
609 return (EINTR);
610 if ((bp->b_flags & B_CACHE) == 0) {
611 bp->b_iocmd = BIO_READ;
612 vfs_busy_pages(bp, 0);
613 error = nfs_doio(bp, cred, p);
614 /*
615 * no error + B_INVAL == directory EOF,
616 * use the block.
617 */
618 if (error == 0 && (bp->b_flags & B_INVAL))
619 break;
620 }
621 /*
622 * An error will throw away the block and the
623 * for loop will break out. If no error and this
624 * is not the block we want, we throw away the
625 * block and go for the next one via the for loop.
626 */
627 if (error || i < lbn)
628 brelse(bp);
629 }
630 }
631 /*
632 * The above while is repeated if we hit another cookie
633 * error. If we hit an error and it wasn't a cookie error,
634 * we give up.
635 */
636 if (error)
637 return (error);
638 }
639
640 /*
641 * If not eof and read aheads are enabled, start one.
642 * (You need the current block first, so that you have the
643 * directory offset cookie of the next block.)
644 */
645 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
646 (bp->b_flags & B_INVAL) == 0 &&
647 (np->n_direofoffset == 0 ||
648 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
649 !(np->n_flag & NQNFSNONCACHE) &&
650 !incore(vp, lbn + 1)) {
651 rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
652 if (rabp) {
653 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
654 rabp->b_flags |= B_ASYNC;
655 rabp->b_iocmd = BIO_READ;
656 vfs_busy_pages(rabp, 0);
657 if (nfs_asyncio(rabp, cred, p)) {
658 rabp->b_flags |= B_INVAL;
659 rabp->b_ioflags |= BIO_ERROR;
660 vfs_unbusy_pages(rabp);
661 brelse(rabp);
662 }
663 } else {
664 brelse(rabp);
665 }
666 }
667 }
668 /*
669		 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
670 * chopped for the EOF condition, we cannot tell how large
671 * NFS directories are going to be until we hit EOF. So
672 * an NFS directory buffer is *not* chopped to its EOF. Now,
673 * it just so happens that b_resid will effectively chop it
674 * to EOF. *BUT* this information is lost if the buffer goes
675 * away and is reconstituted into a B_CACHE state ( due to
676 * being VMIO ) later. So we keep track of the directory eof
677 * in np->n_direofoffset and chop it off as an extra step
678 * right here.
679 */
680 n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
681 if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
682 n = np->n_direofoffset - uio->uio_offset;
683 break;
684 default:
685 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
686 break;
687 };
688
689 if (n > 0) {
690 error = uiomove(bp->b_data + on, (int)n, uio);
691 }
692 switch (vp->v_type) {
693 case VREG:
694 break;
695 case VLNK:
696 n = 0;
697 break;
698 case VDIR:
699 /*
700 * Invalidate buffer if caching is disabled, forcing a
701 * re-read from the remote later.
702 */
703 if (np->n_flag & NQNFSNONCACHE)
704 bp->b_flags |= B_INVAL;
705 break;
706 default:
707 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
708 }
709 brelse(bp);
710 } while (error == 0 && uio->uio_resid > 0 && n > 0);
711 return (error);
712}
713
714/*
715 * Vnode op for write using bio
716 */
717int
718nfs_write(ap)
719 struct vop_write_args /* {
720 struct vnode *a_vp;
721 struct uio *a_uio;
722 int a_ioflag;
723 struct ucred *a_cred;
724 } */ *ap;
725{
726 int biosize;
727 struct uio *uio = ap->a_uio;
728 struct proc *p = uio->uio_procp;
729 struct vnode *vp = ap->a_vp;
730 struct nfsnode *np = VTONFS(vp);
731 struct ucred *cred = ap->a_cred;
732 int ioflag = ap->a_ioflag;
733 struct buf *bp;
734 struct vattr vattr;
735 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
736 daddr_t lbn;
737 int bcount;
738 int n, on, error = 0, iomode, must_commit;
739 int haverslock = 0;
740
741#ifdef DIAGNOSTIC
742 if (uio->uio_rw != UIO_WRITE)
743 panic("nfs_write mode");
744 if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
745 panic("nfs_write proc");
746#endif
747 if (vp->v_type != VREG)
748 return (EIO);
749 if (np->n_flag & NWRITEERR) {
750 np->n_flag &= ~NWRITEERR;
751 return (np->n_error);
752 }
753 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
754 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
755 (void)nfs_fsinfo(nmp, vp, cred, p);
756
757 /*
758 * Synchronously flush pending buffers if we are in synchronous
759 * mode or if we are appending.
760 */
761 if (ioflag & (IO_APPEND | IO_SYNC)) {
762 if (np->n_flag & NMODIFIED) {
763 np->n_attrstamp = 0;
764 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
765 if (error)
766 return (error);
767 }
768 }
769
770 /*
771 * If IO_APPEND then load uio_offset. We restart here if we cannot
772 * get the append lock.
773 */
774restart:
775 if (ioflag & IO_APPEND) {
776 np->n_attrstamp = 0;
777 error = VOP_GETATTR(vp, &vattr, cred, p);
778 if (error)
779 return (error);
780 uio->uio_offset = np->n_size;
781 }
782
783 if (uio->uio_offset < 0)
784 return (EINVAL);
785 if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
786 return (EFBIG);
787 if (uio->uio_resid == 0)
788 return (0);
789
790 /*
791 * We need to obtain the rslock if we intend to modify np->n_size
792	 * in order to guarantee the append point with multiple contending
793	 * writers, to guarantee that no other appenders modify n_size
794	 * while we are trying to obtain a truncated buffer (i.e. to avoid
795	 * accidentally truncating data written by another appender due to
796 * the race), and to ensure that the buffer is populated prior to
797 * our extending of the file. We hold rslock through the entire
798 * operation.
799 *
800 * Note that we do not synchronize the case where someone truncates
801 * the file while we are appending to it because attempting to lock
802 * this case may deadlock other parts of the system unexpectedly.
803 */
804 if ((ioflag & IO_APPEND) ||
805 uio->uio_offset + uio->uio_resid > np->n_size) {
806 switch(nfs_rslock(np, p)) {
807 case ENOLCK:
808 goto restart;
809 /* not reached */
810 case EINTR:
811 case ERESTART:
812 return(EINTR);
813 /* not reached */
814 default:
815 break;
816 }
817 haverslock = 1;
818 }
819
820 /*
821 * Maybe this should be above the vnode op call, but so long as
822	 * file servers have no limits, I don't think it matters
823 */
824 if (p && uio->uio_offset + uio->uio_resid >
825 p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
826 psignal(p, SIGXFSZ);
827 if (haverslock)
828 nfs_rsunlock(np, p);
829 return (EFBIG);
830 }
831
832 biosize = vp->v_mount->mnt_stat.f_iosize;
833
834 do {
835 /*
836 * Check for a valid write lease.
837 */
838 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
839 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
840 do {
841 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
842 } while (error == NQNFS_EXPIRED);
843 if (error)
844 break;
845 if (np->n_lrev != np->n_brev ||
846 (np->n_flag & NQNFSNONCACHE)) {
847 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
848 if (error)
849 break;
850 np->n_brev = np->n_lrev;
851 }
852 }
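		/*
		 * If caching is disabled for this node, bypass the buffer
		 * cache and push the data directly with a synchronous
		 * FILESYNC write rpc (only done for a single iovec).
		 */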
853 if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
854 iomode = NFSV3WRITE_FILESYNC;
855 error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
856 if (must_commit)
857 nfs_clearcommit(vp->v_mount);
858 break;
859 }
860 nfsstats.biocache_writes++;
861 lbn = uio->uio_offset / biosize;
862 on = uio->uio_offset & (biosize-1);
863 n = min((unsigned)(biosize - on), uio->uio_resid);
864again:
865 /*
866 * Handle direct append and file extension cases, calculate
867 * unaligned buffer size.
868 */
869
870 if (uio->uio_offset == np->n_size && n) {
871 /*
872 * Get the buffer (in its pre-append state to maintain
873 * B_CACHE if it was previously set). Resize the
874 * nfsnode after we have locked the buffer to prevent
875 * readers from reading garbage.
876 */
877 bcount = on;
878 bp = nfs_getcacheblk(vp, lbn, bcount, p);
879
880 if (bp != NULL) {
881 long save;
882
883 np->n_size = uio->uio_offset + n;
884 np->n_flag |= NMODIFIED;
885 vnode_pager_setsize(vp, np->n_size);
886
887 save = bp->b_flags & B_CACHE;
888 bcount += n;
889 allocbuf(bp, bcount);
890 bp->b_flags |= save;
891 }
892 } else {
893 /*
894 * Obtain the locked cache block first, and then
895 * adjust the file's size as appropriate.
896 */
897 bcount = on + n;
898 if ((off_t)lbn * biosize + bcount < np->n_size) {
899 if ((off_t)(lbn + 1) * biosize < np->n_size)
900 bcount = biosize;
901 else
902 bcount = np->n_size - (off_t)lbn * biosize;
903 }
904
905 bp = nfs_getcacheblk(vp, lbn, bcount, p);
906
907 if (uio->uio_offset + n > np->n_size) {
908 np->n_size = uio->uio_offset + n;
909 np->n_flag |= NMODIFIED;
910 vnode_pager_setsize(vp, np->n_size);
911 }
912 }
913
914 if (!bp) {
915 error = EINTR;
916 break;
917 }
918
919 /*
920 * Issue a READ if B_CACHE is not set. In special-append
921 * mode, B_CACHE is based on the buffer prior to the write
922 * op and is typically set, avoiding the read. If a read
923 * is required in special append mode, the server will
924 * probably send us a short-read since we extended the file
925	 * on our end, resulting in b_resid == 0 and, thus,
926 * B_CACHE getting set.
927 *
928 * We can also avoid issuing the read if the write covers
929 * the entire buffer. We have to make sure the buffer state
930 * is reasonable in this case since we will not be initiating
931 * I/O. See the comments in kern/vfs_bio.c's getblk() for
932 * more information.
933 *
934 * B_CACHE may also be set due to the buffer being cached
935 * normally.
936 */
937
938 if (on == 0 && n == bcount) {
939 bp->b_flags |= B_CACHE;
940 bp->b_flags &= ~B_INVAL;
941 bp->b_ioflags &= ~BIO_ERROR;
942 }
943
944 if ((bp->b_flags & B_CACHE) == 0) {
945 bp->b_iocmd = BIO_READ;
946 vfs_busy_pages(bp, 0);
947 error = nfs_doio(bp, cred, p);
948 if (error) {
949 brelse(bp);
950 break;
951 }
952 }
953 if (!bp) {
954 error = EINTR;
955 break;
956 }
957 if (bp->b_wcred == NOCRED) {
958 crhold(cred);
959 bp->b_wcred = cred;
960 }
961 np->n_flag |= NMODIFIED;
962
963 /*
964 * If dirtyend exceeds file size, chop it down. This should
965 * not normally occur but there is an append race where it
966 * might occur XXX, so we log it.
967 *
968 * If the chopping creates a reverse-indexed or degenerate
969 * situation with dirtyoff/end, we 0 both of them.
970 */
971
972 if (bp->b_dirtyend > bcount) {
973 printf("NFS append race @%lx:%d\n",
974 (long)bp->b_blkno * DEV_BSIZE,
975 bp->b_dirtyend - bcount);
976 bp->b_dirtyend = bcount;
977 }
978
979 if (bp->b_dirtyoff >= bp->b_dirtyend)
980 bp->b_dirtyoff = bp->b_dirtyend = 0;
981
982 /*
983 * If the new write will leave a contiguous dirty
984 * area, just update the b_dirtyoff and b_dirtyend,
985 * otherwise force a write rpc of the old dirty area.
986 *
987 * While it is possible to merge discontiguous writes due to
988 * our having a B_CACHE buffer ( and thus valid read data
989 * for the hole), we don't because it could lead to
990 * significant cache coherency problems with multiple clients,
991 * especially if locking is implemented later on.
992 *
993 * as an optimization we could theoretically maintain
994 * a linked list of discontinuous areas, but we would still
995 * have to commit them separately so there isn't much
996 * advantage to it except perhaps a bit of asynchronization.
997 */
998
999 if (bp->b_dirtyend > 0 &&
1000 (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
1001 if (BUF_WRITE(bp) == EINTR)
1002 return (EINTR);
1003 goto again;
1004 }
1005
1006 /*
1007 * Check for valid write lease and get one as required.
1008 * In case getblk() and/or bwrite() delayed us.
1009 */
1010 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
1011 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
1012 do {
1013 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
1014 } while (error == NQNFS_EXPIRED);
1015 if (error) {
1016 brelse(bp);
1017 break;
1018 }
1019 if (np->n_lrev != np->n_brev ||
1020 (np->n_flag & NQNFSNONCACHE)) {
1021 brelse(bp);
1022 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1023 if (error)
1024 break;
1025 np->n_brev = np->n_lrev;
1026 goto again;
1027 }
1028 }
1029
1030 error = uiomove((char *)bp->b_data + on, n, uio);
1031
1032 /*
1033 * Since this block is being modified, it must be written
1034 * again and not just committed. Since write clustering does
1035 * not work for the stage 1 data write, only the stage 2
1036 * commit rpc, we have to clear B_CLUSTEROK as well.
1037 */
1038 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1039
1040 if (error) {
1041 bp->b_ioflags |= BIO_ERROR;
1042 brelse(bp);
1043 break;
1044 }
1045
1046 /*
1047 * Only update dirtyoff/dirtyend if not a degenerate
1048 * condition.
1049 */
1050 if (n) {
1051 if (bp->b_dirtyend > 0) {
1052 bp->b_dirtyoff = min(on, bp->b_dirtyoff);
1053 bp->b_dirtyend = max((on + n), bp->b_dirtyend);
1054 } else {
1055 bp->b_dirtyoff = on;
1056 bp->b_dirtyend = on + n;
1057 }
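			/*
			 * Mark the range we just copied in as valid and
			 * clean at the page level so it is not needlessly
			 * re-read; dirtiness is tracked via b_dirtyoff/end.
			 */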
1058 vfs_bio_set_validclean(bp, on, n);
1059 }
1060
1061 /*
1062		 * If the lease is non-cacheable or IO_SYNC is set, do bwrite().
1063 *
1064 * IO_INVAL appears to be unused. The idea appears to be
1065 * to turn off caching in this case. Very odd. XXX
1066 */
1067 if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
1068 if (ioflag & IO_INVAL)
1069 bp->b_flags |= B_NOCACHE;
1070 error = BUF_WRITE(bp);
1071 if (error)
1072 break;
1073 if (np->n_flag & NQNFSNONCACHE) {
1074 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1075 if (error)
1076 break;
1077 }
1078 } else if ((n + on) == biosize &&
1079 (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
1080 bp->b_flags |= B_ASYNC;
1081 (void)nfs_writebp(bp, 0, 0);
1082 } else {
1083 bdwrite(bp);
1084 }
1085 } while (uio->uio_resid > 0 && n > 0);
1086
1087 if (haverslock)
1088 nfs_rsunlock(np, p);
1089
1090 return (error);
1091}
1092
1093/*
1094 * Get an nfs cache block.
1095 *
1096 * Allocate a new one if the block isn't currently in the cache
1097 * and return the block marked busy. If the calling process is
1098 * interrupted by a signal for an interruptible mount point, return
1099 * NULL.
1100 *
1101 * The caller must carefully deal with the possible B_INVAL state of
1102 * the buffer. nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
1103 * indirectly), so synchronous reads can be issued without worrying about
1104 * the B_INVAL state. We have to be a little more careful when dealing
1105 * with writes (see comments in nfs_write()) when extending a file past
1106 * its EOF.
1107 */
1108static struct buf *
1109nfs_getcacheblk(vp, bn, size, p)
1110 struct vnode *vp;
1111 daddr_t bn;
1112 int size;
1113 struct proc *p;
1114{
1115 register struct buf *bp;
1116 struct mount *mp;
1117 struct nfsmount *nmp;
1118
1119 mp = vp->v_mount;
1120 nmp = VFSTONFS(mp);
1121
1122 if (nmp->nm_flag & NFSMNT_INT) {
1123 bp = getblk(vp, bn, size, PCATCH, 0);
1124 while (bp == (struct buf *)0) {
1125 if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
1126 return ((struct buf *)0);
1127 bp = getblk(vp, bn, size, 0, 2 * hz);
1128 }
1129 } else {
1130 bp = getblk(vp, bn, size, 0, 0);
1131 }
1132
1133 if (vp->v_type == VREG) {
1134 int biosize;
1135
1136 biosize = mp->mnt_stat.f_iosize;
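		/*
		 * Express the logical block number in DEV_BSIZE units;
		 * nfs_doio() reconstructs the file offset from b_blkno.
		 */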
1137 bp->b_blkno = bn * (biosize / DEV_BSIZE);
1138 }
1139 return (bp);
1140}
1141
1142/*
1143 * Flush and invalidate all dirty buffers. If another process is already
1144 * doing the flush, just wait for completion.
1145 */
1146int
1147nfs_vinvalbuf(vp, flags, cred, p, intrflg)
1148 struct vnode *vp;
1149 int flags;
1150 struct ucred *cred;
1151 struct proc *p;
1152 int intrflg;
1153{
1154 register struct nfsnode *np = VTONFS(vp);
1155 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1156 int error = 0, slpflag, slptimeo;
1157
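	/*
	 * The vnode is being recycled; its buffers are about to be torn
	 * down anyway, so there is nothing for us to flush.
	 */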
1158 if (vp->v_flag & VXLOCK) {
1159 return (0);
1160 }
1161
1162 if ((nmp->nm_flag & NFSMNT_INT) == 0)
1163 intrflg = 0;
1164 if (intrflg) {
1165 slpflag = PCATCH;
1166 slptimeo = 2 * hz;
1167 } else {
1168 slpflag = 0;
1169 slptimeo = 0;
1170 }
1171 /*
1172 * First wait for any other process doing a flush to complete.
1173 */
1174 while (np->n_flag & NFLUSHINPROG) {
1175 np->n_flag |= NFLUSHWANT;
1176 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
1177 slptimeo);
1178 if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
1179 return (EINTR);
1180 }
1181
1182 /*
1183 * Now, flush as required.
1184 */
1185 np->n_flag |= NFLUSHINPROG;
1186 error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
1187 while (error) {
1188 if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
1189 np->n_flag &= ~NFLUSHINPROG;
1190 if (np->n_flag & NFLUSHWANT) {
1191 np->n_flag &= ~NFLUSHWANT;
1192 wakeup((caddr_t)&np->n_flag);
1193 }
1194 return (EINTR);
1195 }
1196 error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
1197 }
1198 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
1199 if (np->n_flag & NFLUSHWANT) {
1200 np->n_flag &= ~NFLUSHWANT;
1201 wakeup((caddr_t)&np->n_flag);
1202 }
1203 return (0);
1204}
1205
1206/*
1207 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
1208 * This is mainly to avoid queueing async I/O requests when the nfsiods
1209 * are all hung on a dead server.
1210 *
1211 * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
1212 * is eventually dequeued by the async daemon, nfs_doio() *will*.
1213 */
1214int
1215nfs_asyncio(bp, cred, procp)
1216 register struct buf *bp;
1217 struct ucred *cred;
1218 struct proc *procp;
1219{
1220 struct nfsmount *nmp;
1221 int i;
1222 int gotiod;
1223 int slpflag = 0;
1224 int slptimeo = 0;
1225 int error;
1226
1227 /*
1228 * If no async daemons then return EIO to force caller to run the rpc
1229 * synchronously.
1230 */
1231 if (nfs_numasync == 0)
1232 return (EIO);
1233
1234 nmp = VFSTONFS(bp->b_vp->v_mount);
1235
1236 /*
1237	 * Commits are usually short and sweet, so let's save some cpu and
1238 * leave the async daemons for more important rpc's (such as reads
1239 * and writes).
1240 */
1241 if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
1242 (nmp->nm_bufqiods > nfs_numasync / 2)) {
1243 return(EIO);
1244 }
1245
1246again:
1247 if (nmp->nm_flag & NFSMNT_INT)
1248 slpflag = PCATCH;
1249 gotiod = FALSE;
1250
1251 /*
1252 * Find a free iod to process this request.
1253 */
1254 for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
1255 if (nfs_iodwant[i]) {
1256 /*
1257 * Found one, so wake it up and tell it which
1258 * mount to process.
1259 */
1260 NFS_DPF(ASYNCIO,
1261 ("nfs_asyncio: waking iod %d for mount %p\n",
1262 i, nmp));
1263 nfs_iodwant[i] = (struct proc *)0;
1264 nfs_iodmount[i] = nmp;
1265 nmp->nm_bufqiods++;
1266 wakeup((caddr_t)&nfs_iodwant[i]);
1267 gotiod = TRUE;
1268 break;
1269 }
1270
1271 /*
1272 * If none are free, we may already have an iod working on this mount
1273 * point. If so, it will process our request.
1274 */
1275 if (!gotiod) {
1276 if (nmp->nm_bufqiods > 0) {
1277 NFS_DPF(ASYNCIO,
1278 ("nfs_asyncio: %d iods are already processing mount %p\n",
1279 nmp->nm_bufqiods, nmp));
1280 gotiod = TRUE;
1281 }
1282 }
1283
1284 /*
1285 * If we have an iod which can process the request, then queue
1286 * the buffer.
1287 */
1288 if (gotiod) {
1289 /*
1290 * Ensure that the queue never grows too large. We still want
1291		 * to asynchronize so we block rather than return EIO.
1292 */
1293 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
1294 NFS_DPF(ASYNCIO,
1295 ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
1296 nmp->nm_bufqwant = TRUE;
1297 error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
1298 "nfsaio", slptimeo);
1299 if (error) {
1300 if (nfs_sigintr(nmp, NULL, procp))
1301 return (EINTR);
1302 if (slpflag == PCATCH) {
1303 slpflag = 0;
1304 slptimeo = 2 * hz;
1305 }
1306 }
1307 /*
1308 * We might have lost our iod while sleeping,
1309			 * so check and loop if necessary.
1310 */
1311 if (nmp->nm_bufqiods == 0) {
1312 NFS_DPF(ASYNCIO,
1313 ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
1314 goto again;
1315 }
1316 }
1317
1318 if (bp->b_iocmd == BIO_READ) {
1319 if (bp->b_rcred == NOCRED && cred != NOCRED) {
1320 crhold(cred);
1321 bp->b_rcred = cred;
1322 }
1323 } else {
1324 bp->b_flags |= B_WRITEINPROG;
1325 if (bp->b_wcred == NOCRED && cred != NOCRED) {
1326 crhold(cred);
1327 bp->b_wcred = cred;
1328 }
1329 }
1330
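		/*
		 * Hand the buffer off: make its lock kernel-owned so the
		 * iod can release it, then queue it on the mount's request
		 * list for an nfsiod to pick up.
		 */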
1331 BUF_KERNPROC(bp);
1332 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
1333 nmp->nm_bufqlen++;
1334 return (0);
1335 }
1336
1337 /*
1338 * All the iods are busy on other mounts, so return EIO to
1339 * force the caller to process the i/o synchronously.
1340 */
1341 NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
1342 return (EIO);
1343}
1344
1345/*
1346 * Do an I/O operation to/from a cache block. This may be called
1347 * synchronously or from an nfsiod.
1348 */
1349int
1350nfs_doio(bp, cr, p)
1351 struct buf *bp;
1352 struct ucred *cr;
1353 struct proc *p;
1354{
1355 struct uio *uiop;
1356 struct vnode *vp;
1357 struct nfsnode *np;
1358 struct nfsmount *nmp;
1359 int error = 0, iomode, must_commit = 0;
1360 struct uio uio;
1361 struct iovec io;
1362
1363 vp = bp->b_vp;
1364 np = VTONFS(vp);
1365 nmp = VFSTONFS(vp->v_mount);
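	/*
	 * Build a kernel-space uio describing the transfer; the iovec
	 * base and length are filled in below once we know which part of
	 * the buffer to move.
	 */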
1366 uiop = &uio;
1367 uiop->uio_iov = &io;
1368 uiop->uio_iovcnt = 1;
1369 uiop->uio_segflg = UIO_SYSSPACE;
1370 uiop->uio_procp = p;
1371
1372 /*
1373 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We
1374 * do this here so we do not have to do it in all the code that
1375 * calls us.
1376 */
1377 bp->b_flags &= ~B_INVAL;
1378 bp->b_ioflags &= ~BIO_ERROR;
1379
1380 KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
1381
1382 /*
1383 * Historically, paging was done with physio, but no more.
1384 */
1385 if (bp->b_flags & B_PHYS) {
1386 /*
1387 * ...though reading /dev/drum still gets us here.
1388 */
1389 io.iov_len = uiop->uio_resid = bp->b_bcount;
1390 /* mapping was done by vmapbuf() */
1391 io.iov_base = bp->b_data;
1392 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
1393 if (bp->b_iocmd == BIO_READ) {
1394 uiop->uio_rw = UIO_READ;
1395 nfsstats.read_physios++;
1396 error = nfs_readrpc(vp, uiop, cr);
1397 } else {
1398 int com;
1399
1400 iomode = NFSV3WRITE_DATASYNC;
1401 uiop->uio_rw = UIO_WRITE;
1402 nfsstats.write_physios++;
1403 error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
1404 }
1405 if (error) {
1406 bp->b_ioflags |= BIO_ERROR;
1407 bp->b_error = error;
1408 }
1409 } else if (bp->b_iocmd == BIO_READ) {
1410 io.iov_len = uiop->uio_resid = bp->b_bcount;
1411 io.iov_base = bp->b_data;
1412 uiop->uio_rw = UIO_READ;
1413 switch (vp->v_type) {
1414 case VREG:
1415 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
1416 nfsstats.read_bios++;
1417 error = nfs_readrpc(vp, uiop, cr);
1418 if (!error) {
1419 if (uiop->uio_resid) {
1420 /*
1421 * If we had a short read with no error, we must have
1422 * hit a file hole. We should zero-fill the remainder.
1423 * This can also occur if the server hits the file EOF.
1424 *
1425 * Holes used to be able to occur due to pending
1426 * writes, but that is not possible any longer.
1427 */
1428 int nread = bp->b_bcount - uiop->uio_resid;
1429 int left = bp->b_bcount - nread;
1430
1431 if (left > 0)
1432 bzero((char *)bp->b_data + nread, left);
1433 uiop->uio_resid = 0;
1434 }
1435 }
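			/*
			 * If this vnode backs an executable that is in use
			 * (VTEXT) and the file has changed on the server,
			 * kill the process rather than let it run with
			 * inconsistent text.
			 */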
1436 if (p && (vp->v_flag & VTEXT) &&
1437 (((nmp->nm_flag & NFSMNT_NQNFS) &&
1438 NQNFS_CKINVALID(vp, np, ND_READ) &&
1439 np->n_lrev != np->n_brev) ||
1440 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
1441 np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
1442 uprintf("Process killed due to text file modification\n");
1443 psignal(p, SIGKILL);
1444 PHOLD(p);
1445 }
1446 break;
1447 case VLNK:
1448 uiop->uio_offset = (off_t)0;
1449 nfsstats.readlink_bios++;
1450 error = nfs_readlinkrpc(vp, uiop, cr);
1451 break;
1452 case VDIR:
1453 nfsstats.readdir_bios++;
1454 uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
1455 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
1456 error = nfs_readdirplusrpc(vp, uiop, cr);
1457 if (error == NFSERR_NOTSUPP)
1458 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
1459 }
1460 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
1461 error = nfs_readdirrpc(vp, uiop, cr);
1462 /*
1463 * end-of-directory sets B_INVAL but does not generate an
1464 * error.
1465 */
1466 if (error == 0 && uiop->uio_resid == bp->b_bcount)
1467 bp->b_flags |= B_INVAL;
1468 break;
1469 default:
1470 printf("nfs_doio: type %x unexpected\n",vp->v_type);
1471 break;
1472 };
1473 if (error) {
1474 bp->b_ioflags |= BIO_ERROR;
1475 bp->b_error = error;
1476 }
1477 } else {
1478 /*
1479 * If we only need to commit, try to commit
1480 */
1481 if (bp->b_flags & B_NEEDCOMMIT) {
1482 int retv;
1483 off_t off;
1484
1485 off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
1486 bp->b_flags |= B_WRITEINPROG;
1487 retv = nfs_commit(
1488 bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff,
1489 bp->b_wcred, p);
1490 bp->b_flags &= ~B_WRITEINPROG;
1491 if (retv == 0) {
1492 bp->b_dirtyoff = bp->b_dirtyend = 0;
1493 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1494 bp->b_resid = 0;
1495 bufdone(bp);
1496 return (0);
1497 }
1498 if (retv == NFSERR_STALEWRITEVERF) {
1499 nfs_clearcommit(bp->b_vp->v_mount);
1500 }
1501 }
1502
1503 /*
1504 * Setup for actual write
1505 */
1506
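		/*
		 * Clamp the dirty region so the write rpc never extends
		 * past the file's current size.
		 */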
1507 if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
1508 bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
1509
1510 if (bp->b_dirtyend > bp->b_dirtyoff) {
1511 io.iov_len = uiop->uio_resid = bp->b_dirtyend
1512 - bp->b_dirtyoff;
1513 uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
1514 + bp->b_dirtyoff;
1515 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
1516 uiop->uio_rw = UIO_WRITE;
1517 nfsstats.write_bios++;
1518
1519 if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
1520 iomode = NFSV3WRITE_UNSTABLE;
1521 else
1522 iomode = NFSV3WRITE_FILESYNC;
1523
1524 bp->b_flags |= B_WRITEINPROG;
1525 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
1526
1527 /*
1528 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
1529 * to cluster the buffers needing commit. This will allow
1530 * the system to submit a single commit rpc for the whole
1531 * cluster. We can do this even if the buffer is not 100%
1532 * dirty (relative to the NFS blocksize), so we optimize the
1533			 * append-to-file case.
1534 *
1535 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
1536 * cleared because write clustering only works for commit
1537 * rpc's, not for the data portion of the write).
1538 */
1539
1540 if (!error && iomode == NFSV3WRITE_UNSTABLE) {
1541 bp->b_flags |= B_NEEDCOMMIT;
1542 if (bp->b_dirtyoff == 0
1543 && bp->b_dirtyend == bp->b_bcount)
1544 bp->b_flags |= B_CLUSTEROK;
1545 } else {
1546 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1547 }
1548 bp->b_flags &= ~B_WRITEINPROG;
1549
1550 /*
1551 * For an interrupted write, the buffer is still valid
1552			 * so we can't set BIO_ERROR; instead we report the interruption
1553			 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
1554 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
1555 * is not relevant, so the rpc attempt is essentially
1556 * a noop. For the case of a V3 write rpc not being
1557 * committed to stable storage, the block is still
1558 * dirty and requires either a commit rpc or another
1559 * write rpc with iomode == NFSV3WRITE_FILESYNC before
1560 * the block is reused. This is indicated by setting
1561 * the B_DELWRI and B_NEEDCOMMIT flags.
1562 *
1563 * If the buffer is marked B_PAGING, it does not reside on
1564 * the vp's paging queues so we cannot call bdirty(). The
1565 * bp in this case is not an NFS cache block so we should
1566 * be safe. XXX
1567 */
1568 if (error == EINTR
1569 || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
1570 int s;
1571
1572 s = splbio();
1573 bp->b_flags &= ~(B_INVAL|B_NOCACHE);
1574 if ((bp->b_flags & B_PAGING) == 0) {
1575 bdirty(bp);
1576 bp->b_flags &= ~B_DONE;
1577 }
1578 if (error && (bp->b_flags & B_ASYNC) == 0)
1579 bp->b_flags |= B_EINTR;
1580 splx(s);
1581 } else {
1582 if (error) {
1583 bp->b_ioflags |= BIO_ERROR;
1584 bp->b_error = np->n_error = error;
1585 np->n_flag |= NWRITEERR;
1586 }
1587 bp->b_dirtyoff = bp->b_dirtyend = 0;
1588 }
1589 } else {
1590 bp->b_resid = 0;
1591 bufdone(bp);
1592 return (0);
1593 }
1594 }
1595 bp->b_resid = uiop->uio_resid;
1596 if (must_commit)
1597 nfs_clearcommit(vp->v_mount);
1598 bufdone(bp);
1599 return (error);
1600}