/* nfs_bio.c revision 9336 */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
35 * 36 * @(#)nfs_bio.c 8.5 (Berkeley) 1/4/94 37 * $Id: nfs_bio.c,v 1.14 1995/05/30 08:12:35 rgrimes Exp $ 38 */ 39 40#include <sys/param.h> 41#include <sys/systm.h> 42#include <sys/resourcevar.h> 43#include <sys/signalvar.h> 44#include <sys/proc.h> 45#include <sys/buf.h> 46#include <sys/vnode.h> 47#include <sys/mount.h> 48#include <sys/kernel.h> 49 50#include <vm/vm.h> 51 52#include <nfs/rpcv2.h> 53#include <nfs/nfsproto.h> 54#include <nfs/nfs.h> 55#include <nfs/nfsmount.h> 56#include <nfs/nqnfs.h> 57#include <nfs/nfsnode.h> 58 59struct buf *nfs_getcacheblk(); 60extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; 61extern int nfs_numasync; 62extern struct nfsstats nfsstats; 63 64/* 65 * Ifdefs for FreeBSD-current's merged VM/buffer cache. It is unfortunate 66 * that this isn't done inside getblk() and brelse() so these calls 67 * wouldn't need to be here. 68 */ 69#ifdef B_VMIO 70#define vnode_pager_uncache(vp) 71#else 72#define vfs_busy_pages(bp, f) 73#define vfs_unbusy_pages(bp) 74#define vfs_dirty_pages(bp) 75#endif 76 77/* 78 * Vnode op for read using bio 79 * Any similarity to readip() is purely coincidental 80 */ 81int 82nfs_bioread(vp, uio, ioflag, cred) 83 register struct vnode *vp; 84 register struct uio *uio; 85 int ioflag; 86 struct ucred *cred; 87{ 88 register struct nfsnode *np = VTONFS(vp); 89 register int biosize, diff, i; 90 struct buf *bp = 0, *rabp; 91 struct vattr vattr; 92 struct proc *p; 93 struct nfsmount *nmp = VFSTONFS(vp->v_mount); 94 daddr_t lbn, rabn; 95 int bufsize; 96 int nra, error = 0, n = 0, on = 0, not_readin; 97 nfsquad_t tquad; 98 99#ifdef DIAGNOSTIC 100 if (uio->uio_rw != UIO_READ) 101 panic("nfs_read mode"); 102#endif 103 if (uio->uio_resid == 0) 104 return (0); 105 if (uio->uio_offset < 0) 106 return (EINVAL); 107 p = uio->uio_procp; 108 if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3) 109 (void)nfs_fsinfo(nmp, vp, cred, p); 110 biosize = nmp->nm_rsize; 111 /* 112 * For nfs, cache consistency can only be 
maintained approximately. 113 * Although RFC1094 does not specify the criteria, the following is 114 * believed to be compatible with the reference port. 115 * For nqnfs, full cache consistency is maintained within the loop. 116 * For nfs: 117 * If the file's modify time on the server has changed since the 118 * last read rpc or you have written to the file, 119 * you may have lost data cache consistency with the 120 * server, so flush all of the file's data out of the cache. 121 * Then force a getattr rpc to ensure that you have up to date 122 * attributes. 123 * NB: This implies that cache data can be read when up to 124 * NFS_ATTRTIMEO seconds out of date. If you find that you need current 125 * attributes this could be forced by setting n_attrstamp to 0 before 126 * the VOP_GETATTR() call. 127 */ 128 if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && vp->v_type != VLNK) { 129 if (np->n_flag & NMODIFIED) { 130 if (vp->v_type != VREG) { 131 if (vp->v_type != VDIR) 132 panic("nfs: bioread, not dir"); 133 nfs_invaldir(vp); 134 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 135 if (error) 136 return (error); 137 } 138 np->n_attrstamp = 0; 139 error = VOP_GETATTR(vp, &vattr, cred, p); 140 if (error) 141 return (error); 142 np->n_mtime = vattr.va_mtime.ts_sec; 143 } else { 144 error = VOP_GETATTR(vp, &vattr, cred, p); 145 if (error) 146 return (error); 147 if (np->n_mtime != vattr.va_mtime.ts_sec) { 148 if (vp->v_type == VDIR) 149 nfs_invaldir(vp); 150 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 151 if (error) 152 return (error); 153 np->n_mtime = vattr.va_mtime.ts_sec; 154 } 155 } 156 } 157 do { 158 159 /* 160 * Get a valid lease. If cached data is stale, flush it. 
161 */ 162 if (nmp->nm_flag & NFSMNT_NQNFS) { 163 if (NQNFS_CKINVALID(vp, np, ND_READ)) { 164 do { 165 error = nqnfs_getlease(vp, ND_READ, cred, p); 166 } while (error == NQNFS_EXPIRED); 167 if (error) 168 return (error); 169 if (np->n_lrev != np->n_brev || 170 (np->n_flag & NQNFSNONCACHE) || 171 ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) { 172 if (vp->v_type == VDIR) 173 nfs_invaldir(vp); 174 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 175 if (error) 176 return (error); 177 np->n_brev = np->n_lrev; 178 } 179 } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) { 180 nfs_invaldir(vp); 181 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 182 if (error) 183 return (error); 184 } 185 } 186 if (np->n_flag & NQNFSNONCACHE) { 187 switch (vp->v_type) { 188 case VREG: 189 return (nfs_readrpc(vp, uio, cred)); 190 case VLNK: 191 return (nfs_readlinkrpc(vp, uio, cred)); 192 case VDIR: 193 break; 194 default: 195 printf(" NQNFSNONCACHE: type %x unexpected\n", 196 vp->v_type); 197 }; 198 } 199 switch (vp->v_type) { 200 case VREG: 201 nfsstats.biocache_reads++; 202 lbn = uio->uio_offset / biosize; 203 on = uio->uio_offset & (biosize - 1); 204 not_readin = 1; 205 206 /* 207 * Start the read ahead(s), as required. 208 */ 209 if (nfs_numasync > 0 && nmp->nm_readahead > 0) { 210 for (nra = 0; nra < nmp->nm_readahead && 211 (lbn + 1 + nra) * biosize < np->n_size; nra++) { 212 rabn = lbn + 1 + nra; 213 if (!incore(vp, rabn)) { 214 rabp = nfs_getcacheblk(vp, rabn, biosize, p); 215 if (!rabp) 216 return (EINTR); 217 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 218 rabp->b_flags |= (B_READ | B_ASYNC); 219 vfs_busy_pages(rabp, 0); 220 if (nfs_asyncio(rabp, cred)) { 221 rabp->b_flags |= B_INVAL|B_ERROR; 222 vfs_unbusy_pages(rabp); 223 brelse(rabp); 224 } 225 } else { 226 brelse(rabp); 227 } 228 } 229 } 230 } 231 232 /* 233 * If the block is in the cache and has the required data 234 * in a valid region, just copy it out. 
235 * Otherwise, get the block and write back/read in, 236 * as required. 237 */ 238again: 239 bufsize = biosize; 240 if ((lbn + 1) * biosize > np->n_size) { 241 bufsize = np->n_size - lbn * biosize; 242 bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 243 } 244 bp = nfs_getcacheblk(vp, lbn, bufsize, p); 245 if (!bp) 246 return (EINTR); 247 if ((bp->b_flags & B_CACHE) == 0) { 248 bp->b_flags |= B_READ; 249 not_readin = 0; 250 vfs_busy_pages(bp, 0); 251 error = nfs_doio(bp, cred, p); 252 if (error) { 253 brelse(bp); 254 return (error); 255 } 256 } 257 if (bufsize > on) { 258 n = min((unsigned)(bufsize - on), uio->uio_resid); 259 } else { 260 n = 0; 261 } 262 diff = np->n_size - uio->uio_offset; 263 if (diff < n) 264 n = diff; 265 if (not_readin && n > 0) { 266 if (on < bp->b_validoff || (on + n) > bp->b_validend) { 267 bp->b_flags |= B_NOCACHE; 268 if (bp->b_dirtyend > 0) { 269 if ((bp->b_flags & B_DELWRI) == 0) 270 panic("nfsbioread"); 271 if (VOP_BWRITE(bp) == EINTR) 272 return (EINTR); 273 } else 274 brelse(bp); 275 goto again; 276 } 277 } 278 vp->v_lastr = lbn; 279 diff = (on >= bp->b_validend) ? 
0 : (bp->b_validend - on); 280 if (diff < n) 281 n = diff; 282 break; 283 case VLNK: 284 nfsstats.biocache_readlinks++; 285 bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p); 286 if (!bp) 287 return (EINTR); 288 if ((bp->b_flags & B_CACHE) == 0) { 289 bp->b_flags |= B_READ; 290 vfs_busy_pages(bp, 0); 291 error = nfs_doio(bp, cred, p); 292 if (error) { 293 bp->b_flags |= B_ERROR; 294 brelse(bp); 295 return (error); 296 } 297 } 298 n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); 299 on = 0; 300 break; 301 case VDIR: 302 nfsstats.biocache_readdirs++; 303 lbn = uio->uio_offset / NFS_DIRBLKSIZ; 304 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); 305 bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p); 306 if (!bp) 307 return (EINTR); 308 if ((bp->b_flags & B_CACHE) == 0) { 309 bp->b_flags |= B_READ; 310 vfs_busy_pages(bp, 0); 311 error = nfs_doio(bp, cred, p); 312 if (error) { 313 brelse(bp); 314 while (error == NFSERR_BAD_COOKIE) { 315 nfs_invaldir(vp); 316 error = nfs_vinvalbuf(vp, 0, cred, p, 1); 317 /* 318 * Yuck! The directory has been modified on the 319 * server. The only way to get the block is by 320 * reading from the beginning to get all the 321 * offset cookies. 322 */ 323 for (i = 0; i <= lbn && !error; i++) { 324 bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p); 325 if (!bp) 326 return (EINTR); 327 if ((bp->b_flags & B_DONE) == 0) { 328 bp->b_flags |= B_READ; 329 vfs_busy_pages(bp, 0); 330 error = nfs_doio(bp, cred, p); 331 if (error) 332 brelse(bp); 333 } 334 } 335 } 336 if (error) 337 return (error); 338 } 339 } 340 341 /* 342 * If not eof and read aheads are enabled, start one. 343 * (You need the current block first, so that you have the 344 * directory offset cookie of the next block.) 
345 */ 346 if (nfs_numasync > 0 && nmp->nm_readahead > 0 && 347 (np->n_direofoffset == 0 || 348 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && 349 !(np->n_flag & NQNFSNONCACHE) && 350 !incore(vp, lbn + 1)) { 351 rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p); 352 if (rabp) { 353 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 354 rabp->b_flags |= (B_READ | B_ASYNC); 355 vfs_busy_pages(rabp, 0); 356 if (nfs_asyncio(rabp, cred)) { 357 rabp->b_flags |= B_INVAL|B_ERROR; 358 vfs_unbusy_pages(rabp); 359 brelse(rabp); 360 } 361 } else { 362 brelse(rabp); 363 } 364 } 365 } 366 n = min(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); 367 break; 368 default: 369 printf(" nfs_bioread: type %x unexpected\n",vp->v_type); 370 break; 371 }; 372 373 if (n > 0) { 374 error = uiomove(bp->b_data + on, (int)n, uio); 375 } 376 switch (vp->v_type) { 377 case VREG: 378 break; 379 case VLNK: 380 n = 0; 381 break; 382 case VDIR: 383 if (np->n_flag & NQNFSNONCACHE) 384 bp->b_flags |= B_INVAL; 385 break; 386 default: 387 printf(" nfs_bioread: type %x unexpected\n",vp->v_type); 388 } 389 brelse(bp); 390 } while (error == 0 && uio->uio_resid > 0 && n > 0); 391 return (error); 392} 393 394/* 395 * Vnode op for write using bio 396 */ 397int 398nfs_write(ap) 399 struct vop_write_args /* { 400 struct vnode *a_vp; 401 struct uio *a_uio; 402 int a_ioflag; 403 struct ucred *a_cred; 404 } */ *ap; 405{ 406 register int biosize; 407 register struct uio *uio = ap->a_uio; 408 struct proc *p = uio->uio_procp; 409 register struct vnode *vp = ap->a_vp; 410 struct nfsnode *np = VTONFS(vp); 411 register struct ucred *cred = ap->a_cred; 412 int ioflag = ap->a_ioflag; 413 struct buf *bp; 414 struct vattr vattr; 415 struct nfsmount *nmp = VFSTONFS(vp->v_mount); 416 daddr_t lbn, bn; 417 int bufsize; 418 int n, on, error = 0, iomode, must_commit; 419 420#ifdef DIAGNOSTIC 421 if (uio->uio_rw != UIO_WRITE) 422 panic("nfs_write mode"); 423 if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != 
curproc) 424 panic("nfs_write proc"); 425#endif 426 if (vp->v_type != VREG) 427 return (EIO); 428 if (np->n_flag & NWRITEERR) { 429 np->n_flag &= ~NWRITEERR; 430 return (np->n_error); 431 } 432 if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3) 433 (void)nfs_fsinfo(nmp, vp, cred, p); 434 if (ioflag & (IO_APPEND | IO_SYNC)) { 435 if (np->n_flag & NMODIFIED) { 436 np->n_attrstamp = 0; 437 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 438 if (error) 439 return (error); 440 } 441 if (ioflag & IO_APPEND) { 442 np->n_attrstamp = 0; 443 error = VOP_GETATTR(vp, &vattr, cred, p); 444 if (error) 445 return (error); 446 uio->uio_offset = np->n_size; 447 } 448 } 449 if (uio->uio_offset < 0) 450 return (EINVAL); 451 if (uio->uio_resid == 0) 452 return (0); 453 /* 454 * Maybe this should be above the vnode op call, but so long as 455 * file servers have no limits, i don't think it matters 456 */ 457 if (p && uio->uio_offset + uio->uio_resid > 458 p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { 459 psignal(p, SIGXFSZ); 460 return (EFBIG); 461 } 462 /* 463 * I use nm_rsize, not nm_wsize so that all buffer cache blocks 464 * will be the same size within a filesystem. nfs_writerpc will 465 * still use nm_wsize when sizing the rpc's. 466 */ 467 biosize = nmp->nm_rsize; 468 do { 469 470 /* 471 * XXX make sure we aren't cached in the VM page cache 472 */ 473 /* 474 * Check for a valid write lease. 
475 */ 476 if ((nmp->nm_flag & NFSMNT_NQNFS) && 477 NQNFS_CKINVALID(vp, np, ND_WRITE)) { 478 do { 479 error = nqnfs_getlease(vp, ND_WRITE, cred, p); 480 } while (error == NQNFS_EXPIRED); 481 if (error) 482 return (error); 483 if (np->n_lrev != np->n_brev || 484 (np->n_flag & NQNFSNONCACHE)) { 485 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 486 if (error) 487 return (error); 488 np->n_brev = np->n_lrev; 489 } 490 } 491 if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) { 492 iomode = NFSV3WRITE_FILESYNC; 493 error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit); 494 if (must_commit) 495 nfs_clearcommit(vp->v_mount); 496 return (error); 497 } 498 nfsstats.biocache_writes++; 499 lbn = uio->uio_offset / biosize; 500 on = uio->uio_offset & (biosize-1); 501 n = min((unsigned)(biosize - on), uio->uio_resid); 502again: 503 if (uio->uio_offset + n > np->n_size) { 504 np->n_size = uio->uio_offset + n; 505 vnode_pager_setsize(vp, (u_long)np->n_size); 506 } 507 bufsize = biosize; 508 if ((lbn + 1) * biosize > np->n_size) { 509 bufsize = np->n_size - lbn * biosize; 510 bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 511 } 512 bp = nfs_getcacheblk(vp, lbn, bufsize, p); 513 if (!bp) 514 return (EINTR); 515 if (bp->b_wcred == NOCRED) { 516 crhold(cred); 517 bp->b_wcred = cred; 518 } 519 np->n_flag |= NMODIFIED; 520 521 if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) { 522 bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE); 523 } 524 525 /* 526 * If the new write will leave a contiguous dirty 527 * area, just update the b_dirtyoff and b_dirtyend, 528 * otherwise force a write rpc of the old dirty area. 529 */ 530 if (bp->b_dirtyend > 0 && 531 (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { 532 bp->b_proc = p; 533 if (VOP_BWRITE(bp) == EINTR) 534 return (EINTR); 535 goto again; 536 } 537 538 /* 539 * Check for valid write lease and get one as required. 540 * In case getblk() and/or bwrite() delayed us. 
541 */ 542 if ((nmp->nm_flag & NFSMNT_NQNFS) && 543 NQNFS_CKINVALID(vp, np, ND_WRITE)) { 544 do { 545 error = nqnfs_getlease(vp, ND_WRITE, cred, p); 546 } while (error == NQNFS_EXPIRED); 547 if (error) { 548 brelse(bp); 549 return (error); 550 } 551 if (np->n_lrev != np->n_brev || 552 (np->n_flag & NQNFSNONCACHE)) { 553 brelse(bp); 554 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 555 if (error) 556 return (error); 557 np->n_brev = np->n_lrev; 558 goto again; 559 } 560 } 561 error = uiomove((char *)bp->b_data + on, n, uio); 562 if (error) { 563 bp->b_flags |= B_ERROR; 564 brelse(bp); 565 return (error); 566 } 567 if (bp->b_dirtyend > 0) { 568 bp->b_dirtyoff = min(on, bp->b_dirtyoff); 569 bp->b_dirtyend = max((on + n), bp->b_dirtyend); 570 } else { 571 bp->b_dirtyoff = on; 572 bp->b_dirtyend = on + n; 573 } 574 if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff || 575 bp->b_validoff > bp->b_dirtyend) { 576 bp->b_validoff = bp->b_dirtyoff; 577 bp->b_validend = bp->b_dirtyend; 578 } else { 579 bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff); 580 bp->b_validend = max(bp->b_validend, bp->b_dirtyend); 581 } 582 /* 583 * If the lease is non-cachable or IO_SYNC do bwrite(). 584 */ 585 if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) { 586 bp->b_proc = p; 587 error = VOP_BWRITE(bp); 588 if (error) 589 return (error); 590 if (np->n_flag & NQNFSNONCACHE) { 591 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 592 if (error) 593 return (error); 594 } 595 } else if ((n + on) == biosize && 596 (nmp->nm_flag & NFSMNT_NQNFS) == 0) { 597 bp->b_proc = (struct proc *)0; 598 bp->b_flags |= B_ASYNC; 599 (void)nfs_writebp(bp, 0); 600 } else 601 bdwrite(bp); 602 } while (uio->uio_resid > 0 && n > 0); 603 return (0); 604} 605 606/* 607 * Get an nfs cache block. 608 * Allocate a new one if the block isn't currently in the cache 609 * and return the block marked busy. 
If the calling process is 610 * interrupted by a signal for an interruptible mount point, return 611 * NULL. 612 */ 613struct buf * 614nfs_getcacheblk(vp, bn, size, p) 615 struct vnode *vp; 616 daddr_t bn; 617 int size; 618 struct proc *p; 619{ 620 register struct buf *bp; 621 struct nfsmount *nmp = VFSTONFS(vp->v_mount); 622 int biosize = nmp->nm_rsize; 623 624 if (nmp->nm_flag & NFSMNT_INT) { 625 bp = getblk(vp, bn, size, PCATCH, 0); 626 while (bp == (struct buf *)0) { 627 if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) 628 return ((struct buf *)0); 629 bp = getblk(vp, bn, size, 0, 2 * hz); 630 } 631 } else 632 bp = getblk(vp, bn, size, 0, 0); 633 634 if( vp->v_type == VREG) 635 bp->b_blkno = (bn * biosize) / DEV_BSIZE; 636 637 return (bp); 638} 639 640/* 641 * Flush and invalidate all dirty buffers. If another process is already 642 * doing the flush, just wait for completion. 643 */ 644int 645nfs_vinvalbuf(vp, flags, cred, p, intrflg) 646 struct vnode *vp; 647 int flags; 648 struct ucred *cred; 649 struct proc *p; 650 int intrflg; 651{ 652 register struct nfsnode *np = VTONFS(vp); 653 struct nfsmount *nmp = VFSTONFS(vp->v_mount); 654 int error = 0, slpflag, slptimeo; 655 656 if ((nmp->nm_flag & NFSMNT_INT) == 0) 657 intrflg = 0; 658 if (intrflg) { 659 slpflag = PCATCH; 660 slptimeo = 2 * hz; 661 } else { 662 slpflag = 0; 663 slptimeo = 0; 664 } 665 /* 666 * First wait for any other process doing a flush to complete. 667 */ 668 while (np->n_flag & NFLUSHINPROG) { 669 np->n_flag |= NFLUSHWANT; 670 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", 671 slptimeo); 672 if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) 673 return (EINTR); 674 } 675 676 /* 677 * Now, flush as required. 
678 */ 679 np->n_flag |= NFLUSHINPROG; 680 error = vinvalbuf(vp, flags, cred, p, slpflag, 0); 681 while (error) { 682 if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) { 683 np->n_flag &= ~NFLUSHINPROG; 684 if (np->n_flag & NFLUSHWANT) { 685 np->n_flag &= ~NFLUSHWANT; 686 wakeup((caddr_t)&np->n_flag); 687 } 688 return (EINTR); 689 } 690 error = vinvalbuf(vp, flags, cred, p, 0, slptimeo); 691 } 692 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG); 693 if (np->n_flag & NFLUSHWANT) { 694 np->n_flag &= ~NFLUSHWANT; 695 wakeup((caddr_t)&np->n_flag); 696 } 697 return (0); 698} 699 700/* 701 * Initiate asynchronous I/O. Return an error if no nfsiods are available. 702 * This is mainly to avoid queueing async I/O requests when the nfsiods 703 * are all hung on a dead server. 704 */ 705int 706nfs_asyncio(bp, cred) 707 register struct buf *bp; 708 struct ucred *cred; 709{ 710 register int i; 711 712 if (nfs_numasync == 0) 713 return (EIO); 714 for (i = 0; i < NFS_MAXASYNCDAEMON; i++) 715 if (nfs_iodwant[i]) { 716 if (bp->b_flags & B_READ) { 717 if (bp->b_rcred == NOCRED && cred != NOCRED) { 718 crhold(cred); 719 bp->b_rcred = cred; 720 } 721 } else { 722 bp->b_flags |= B_WRITEINPROG; 723 if (bp->b_wcred == NOCRED && cred != NOCRED) { 724 crhold(cred); 725 bp->b_wcred = cred; 726 } 727 } 728 729 TAILQ_INSERT_TAIL(&nfs_bufq, bp, b_freelist); 730 nfs_iodwant[i] = (struct proc *)0; 731 wakeup((caddr_t)&nfs_iodwant[i]); 732 return (0); 733 } 734 735 /* 736 * If it is a read or a write already marked B_WRITEINPROG or B_NOCACHE 737 * return EIO so the process will call nfs_doio() and do it 738 * synchronously. 739 */ 740 if (bp->b_flags & (B_READ | B_WRITEINPROG | B_NOCACHE)) 741 return (EIO); 742 743 /* 744 * Just turn the async write into a delayed write, instead of 745 * doing in synchronously. Hopefully, at least one of the nfsiods 746 * is currently doing a write for this file and will pick up the 747 * delayed writes before going back to sleep. 
748 */ 749 bp->b_flags |= B_DELWRI; 750 reassignbuf(bp, bp->b_vp); 751 biodone(bp); 752 return (0); 753} 754 755/* 756 * Do an I/O operation to/from a cache block. This may be called 757 * synchronously or from an nfsiod. 758 */ 759int 760nfs_doio(bp, cr, p) 761 register struct buf *bp; 762 struct ucred *cr; 763 struct proc *p; 764{ 765 register struct uio *uiop; 766 register struct vnode *vp; 767 struct nfsnode *np; 768 struct nfsmount *nmp; 769 int error = 0, diff, len, iomode, must_commit = 0; 770 struct uio uio; 771 struct iovec io; 772 nfsquad_t tquad; 773 774 vp = bp->b_vp; 775 np = VTONFS(vp); 776 nmp = VFSTONFS(vp->v_mount); 777 uiop = &uio; 778 uiop->uio_iov = &io; 779 uiop->uio_iovcnt = 1; 780 uiop->uio_segflg = UIO_SYSSPACE; 781 uiop->uio_procp = p; 782 783 /* 784 * Historically, paging was done with physio, but no more. 785 */ 786 if (bp->b_flags & B_PHYS) { 787 /* 788 * ...though reading /dev/drum still gets us here. 789 */ 790 io.iov_len = uiop->uio_resid = bp->b_bcount; 791 /* mapping was done by vmapbuf() */ 792 io.iov_base = bp->b_data; 793 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; 794 if (bp->b_flags & B_READ) { 795 uiop->uio_rw = UIO_READ; 796 nfsstats.read_physios++; 797 error = nfs_readrpc(vp, uiop, cr); 798 } else { 799 int com; 800 801 iomode = NFSV3WRITE_DATASYNC; 802 uiop->uio_rw = UIO_WRITE; 803 nfsstats.write_physios++; 804 error = nfs_writerpc(vp, uiop, cr, &iomode, &com); 805 } 806 if (error) { 807 bp->b_flags |= B_ERROR; 808 bp->b_error = error; 809 } 810 } else if (bp->b_flags & B_READ) { 811 io.iov_len = uiop->uio_resid = bp->b_bcount; 812 io.iov_base = bp->b_data; 813 uiop->uio_rw = UIO_READ; 814 switch (vp->v_type) { 815 case VREG: 816 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; 817 nfsstats.read_bios++; 818 error = nfs_readrpc(vp, uiop, cr); 819 if (!error) { 820 bp->b_validoff = 0; 821 if (uiop->uio_resid) { 822 /* 823 * If len > 0, there is a hole in the file and 824 * no writes after the hole have been 
pushed to 825 * the server yet. 826 * Just zero fill the rest of the valid area. 827 */ 828 diff = bp->b_bcount - uiop->uio_resid; 829 len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE 830 + diff); 831 if (len > 0) { 832 len = min(len, uiop->uio_resid); 833 bzero((char *)bp->b_data + diff, len); 834 bp->b_validend = diff + len; 835 } else 836 bp->b_validend = diff; 837 } else 838 bp->b_validend = bp->b_bcount; 839 } 840 if (p && (vp->v_flag & VTEXT) && 841 (((nmp->nm_flag & NFSMNT_NQNFS) && 842 NQNFS_CKINVALID(vp, np, ND_READ) && 843 np->n_lrev != np->n_brev) || 844 (!(nmp->nm_flag & NFSMNT_NQNFS) && 845 np->n_mtime != np->n_vattr.va_mtime.ts_sec))) { 846 uprintf("Process killed due to text file modification\n"); 847 psignal(p, SIGKILL); 848#ifdef __NetBSD__ 849 p->p_holdcnt++; 850#else 851 p->p_flag |= P_NOSWAP; 852#endif 853 } 854 break; 855 case VLNK: 856 uiop->uio_offset = (off_t)0; 857 nfsstats.readlink_bios++; 858 error = nfs_readlinkrpc(vp, uiop, cr); 859 break; 860 case VDIR: 861 nfsstats.readdir_bios++; 862 uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ; 863 if (nmp->nm_flag & NFSMNT_RDIRPLUS) { 864 error = nfs_readdirplusrpc(vp, uiop, cr); 865 if (error == NFSERR_NOTSUPP) 866 nmp->nm_flag &= ~NFSMNT_RDIRPLUS; 867 } 868 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) 869 error = nfs_readdirrpc(vp, uiop, cr); 870 break; 871 default: 872 printf("nfs_doio: type %x unexpected\n",vp->v_type); 873 break; 874 }; 875 if (error) { 876 bp->b_flags |= B_ERROR; 877 bp->b_error = error; 878 } 879 } else { 880 if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size) 881 bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE); 882 883 if (bp->b_dirtyend > bp->b_dirtyoff) { 884 io.iov_len = uiop->uio_resid = bp->b_dirtyend 885 - bp->b_dirtyoff; 886 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE 887 + bp->b_dirtyoff; 888 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; 889 uiop->uio_rw = UIO_WRITE; 890 nfsstats.write_bios++; 891 if ((bp->b_flags 
& (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC) 892 iomode = NFSV3WRITE_UNSTABLE; 893 else 894 iomode = NFSV3WRITE_FILESYNC; 895 bp->b_flags |= B_WRITEINPROG; 896 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); 897 if (!error && iomode == NFSV3WRITE_UNSTABLE) 898 bp->b_flags |= B_NEEDCOMMIT; 899 else 900 bp->b_flags &= ~B_NEEDCOMMIT; 901 bp->b_flags &= ~B_WRITEINPROG; 902 903 /* 904 * For an interrupted write, the buffer is still valid 905 * and the write hasn't been pushed to the server yet, 906 * so we can't set B_ERROR and report the interruption 907 * by setting B_EINTR. For the B_ASYNC case, B_EINTR 908 * is not relevant, so the rpc attempt is essentially 909 * a noop. For the case of a V3 write rpc not being 910 * committed to stable storage, the block is still 911 * dirty and requires either a commit rpc or another 912 * write rpc with iomode == NFSV3WRITE_FILESYNC before 913 * the block is reused. This is indicated by setting 914 * the B_DELWRI and B_NEEDCOMMIT flags. 915 */ 916 if (error == EINTR 917 || (!error && (bp->b_flags & B_NEEDCOMMIT))) { 918 bp->b_flags &= ~(B_INVAL|B_NOCACHE); 919 bp->b_flags |= B_DELWRI; 920 921 /* 922 * Since for the B_ASYNC case, nfs_bwrite() has reassigned the 923 * buffer to the clean list, we have to reassign it back to the 924 * dirty one. Ugh. 925 */ 926 if (bp->b_flags & B_ASYNC) 927 reassignbuf(bp, vp); 928 else 929 bp->b_flags |= B_EINTR; 930 } else { 931 if (error) { 932 bp->b_flags |= B_ERROR; 933 bp->b_error = np->n_error = error; 934 np->n_flag |= NWRITEERR; 935 } 936 bp->b_dirtyoff = bp->b_dirtyend = 0; 937 } 938 } else { 939 bp->b_resid = 0; 940 biodone(bp); 941 return (0); 942 } 943 } 944 bp->b_resid = uiop->uio_resid; 945 if (must_commit) 946 nfs_clearcommit(vp->v_mount); 947 biodone(bp); 948 return (error); 949} 950