nfs_bio.c revision 148268
1139823Simp/*- 21541Srgrimes * Copyright (c) 1989, 1993 31541Srgrimes * The Regents of the University of California. All rights reserved. 41541Srgrimes * 51541Srgrimes * This code is derived from software contributed to Berkeley by 61541Srgrimes * Rick Macklem at The University of Guelph. 71541Srgrimes * 81541Srgrimes * Redistribution and use in source and binary forms, with or without 91541Srgrimes * modification, are permitted provided that the following conditions 101541Srgrimes * are met: 111541Srgrimes * 1. Redistributions of source code must retain the above copyright 121541Srgrimes * notice, this list of conditions and the following disclaimer. 131541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 141541Srgrimes * notice, this list of conditions and the following disclaimer in the 151541Srgrimes * documentation and/or other materials provided with the distribution. 161541Srgrimes * 4. Neither the name of the University nor the names of its contributors 171541Srgrimes * may be used to endorse or promote products derived from this software 181541Srgrimes * without specific prior written permission. 191541Srgrimes * 201541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 211541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 221541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 231541Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 241541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 251541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 261541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 271541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 281541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 291541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 301541Srgrimes * SUCH DAMAGE. 311541Srgrimes * 3222521Sdyson * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 331541Srgrimes */ 341541Srgrimes 3583651Speter#include <sys/cdefs.h> 3683654Speter__FBSDID("$FreeBSD: head/sys/nfsclient/nfs_bio.c 148268 2005-07-21 22:46:56Z ps $"); 3722521Sdyson 381541Srgrimes#include <sys/param.h> 391541Srgrimes#include <sys/systm.h> 4079247Sjhb#include <sys/bio.h> 4179247Sjhb#include <sys/buf.h> 4279247Sjhb#include <sys/kernel.h> 4379247Sjhb#include <sys/mount.h> 4479247Sjhb#include <sys/proc.h> 451541Srgrimes#include <sys/resourcevar.h> 463305Sphk#include <sys/signalvar.h> 4779247Sjhb#include <sys/vmmeter.h> 481541Srgrimes#include <sys/vnode.h> 491541Srgrimes 501541Srgrimes#include <vm/vm.h> 5112662Sdg#include <vm/vm_extern.h> 5225930Sdfr#include <vm/vm_page.h> 5325930Sdfr#include <vm/vm_object.h> 5425930Sdfr#include <vm/vm_pager.h> 5525930Sdfr#include <vm/vnode_pager.h> 561541Srgrimes 57122698Salfred#include <rpc/rpcclnt.h> 58122698Salfred 591541Srgrimes#include <nfs/rpcv2.h> 609336Sdfr#include <nfs/nfsproto.h> 6183651Speter#include <nfsclient/nfs.h> 6283651Speter#include <nfsclient/nfsmount.h> 6383651Speter#include <nfsclient/nfsnode.h> 641541Srgrimes 65122698Salfred#include <nfs4client/nfs4.h> 66122698Salfred 6783651Speterstatic struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, 6883651Speter struct thread *td); 69138899Spsstatic int nfs_directio_write(struct vnode *vp, struct uio *uiop, 70138899Sps struct ucred *cred, int ioflag); 7175580Sphk 72138899Spsextern int nfs_directio_enable; 73138899Spsextern int nfs_directio_allow_mmap; 741541Srgrimes/* 7525930Sdfr * Vnode op for VM getpages. 7625930Sdfr */ 7725930Sdfrint 7883651Speternfs_getpages(struct vop_getpages_args *ap) 7925930Sdfr{ 8046349Salc int i, error, nextoff, size, toff, count, npages; 8132755Sdyson struct uio uio; 8232755Sdyson struct iovec iov; 8332755Sdyson vm_offset_t kva; 8434206Sdyson struct buf *bp; 8536563Speter struct vnode *vp; 8683366Sjulian struct thread *td; 8736563Speter struct ucred *cred; 8836563Speter struct nfsmount *nmp; 89116461Salc vm_object_t object; 9036563Speter vm_page_t *pages; 91138899Sps struct nfsnode *np; 9225930Sdfr 9379224Sdillon GIANT_REQUIRED; 9479224Sdillon 9536563Speter vp = ap->a_vp; 96138899Sps np = VTONFS(vp); 9783366Sjulian td = curthread; /* XXX */ 9891406Sjhb cred = curthread->td_ucred; /* XXX */ 9936563Speter nmp = VFSTONFS(vp->v_mount); 10036563Speter pages = ap->a_m; 10136563Speter count = ap->a_count; 10236563Speter 103116461Salc if ((object = vp->v_object) == NULL) { 10432286Sdyson printf("nfs_getpages: called with non-merged cache vnode??\n"); 10536563Speter return VM_PAGER_ERROR; 10625930Sdfr } 10725930Sdfr 108138899Sps if (!nfs_directio_allow_mmap && (np->n_flag & NNONCACHE) && 109138899Sps (vp->v_type == VREG)) { 110138899Sps printf("nfs_getpages: called on non-cacheable vnode??\n"); 111138899Sps return VM_PAGER_ERROR; 112138899Sps } 113138899Sps 11436563Speter if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 11576827Salfred (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { 116122698Salfred /* We'll never get here for v4, because we always have fsinfo */ 11783366Sjulian (void)nfs_fsinfo(nmp, vp, cred, td); 11876827Salfred } 11946349Salc 12046349Salc npages = btoc(count); 12146349Salc 12234206Sdyson /* 12346349Salc * If the requested page is partially valid, just return it and 12446349Salc * allow the pager to zero-out the blanks. Partially valid pages 12546349Salc * can only occur at the file EOF. 12646349Salc */ 12746349Salc 12846349Salc { 12946349Salc vm_page_t m = pages[ap->a_reqpage]; 13046349Salc 131116461Salc VM_OBJECT_LOCK(object); 132100450Salc vm_page_lock_queues(); 13346349Salc if (m->valid != 0) { 13446349Salc /* handled by vm_fault now */ 13546349Salc /* vm_page_zero_invalid(m, TRUE); */ 13646349Salc for (i = 0; i < npages; ++i) { 13746349Salc if (i != ap->a_reqpage) 13875692Salfred vm_page_free(pages[i]); 13946349Salc } 140100450Salc vm_page_unlock_queues(); 141116461Salc VM_OBJECT_UNLOCK(object); 14246349Salc return(0); 14346349Salc } 144100450Salc vm_page_unlock_queues(); 145116461Salc VM_OBJECT_UNLOCK(object); 14646349Salc } 14746349Salc 14846349Salc /* 14934206Sdyson * We use only the kva address for the buffer, but this is extremely 15034206Sdyson * convienient and fast. 15134206Sdyson */ 15242957Sdillon bp = getpbuf(&nfs_pbuf_freecnt); 15325930Sdfr 15434206Sdyson kva = (vm_offset_t) bp->b_data; 15536563Speter pmap_qenter(kva, pages, npages); 15679247Sjhb cnt.v_vnodein++; 15779247Sjhb cnt.v_vnodepgsin += npages; 15834206Sdyson 15932755Sdyson iov.iov_base = (caddr_t) kva; 16036563Speter iov.iov_len = count; 16132755Sdyson uio.uio_iov = &iov; 16232755Sdyson uio.uio_iovcnt = 1; 16336563Speter uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); 16436563Speter uio.uio_resid = count; 16532755Sdyson uio.uio_segflg = UIO_SYSSPACE; 16632755Sdyson uio.uio_rw = UIO_READ; 16783366Sjulian uio.uio_td = td; 16825930Sdfr 169122953Salfred error = (nmp->nm_rpcops->nr_readrpc)(vp, &uio, cred); 17034206Sdyson pmap_qremove(kva, npages); 17132755Sdyson 17242957Sdillon relpbuf(bp, &nfs_pbuf_freecnt); 17334206Sdyson 17442957Sdillon if (error && (uio.uio_resid == count)) { 17542957Sdillon printf("nfs_getpages: error %d\n", error); 176116461Salc VM_OBJECT_LOCK(object); 177100450Salc vm_page_lock_queues(); 17842957Sdillon for (i = 0; i < npages; ++i) { 17942957Sdillon if (i != ap->a_reqpage) 18075692Salfred vm_page_free(pages[i]); 18142957Sdillon } 182100450Salc vm_page_unlock_queues(); 183116461Salc VM_OBJECT_UNLOCK(object); 18434206Sdyson return VM_PAGER_ERROR; 18542957Sdillon } 18634206Sdyson 18745347Sjulian /* 18845347Sjulian * Calculate the number of bytes read and validate only that number 18945347Sjulian * of bytes. Note that due to pending writes, size may be 0. This 19045347Sjulian * does not mean that the remaining data is invalid! 19145347Sjulian */ 19245347Sjulian 19336563Speter size = count - uio.uio_resid; 194116461Salc VM_OBJECT_LOCK(object); 195100450Salc vm_page_lock_queues(); 19634206Sdyson for (i = 0, toff = 0; i < npages; i++, toff = nextoff) { 19734206Sdyson vm_page_t m; 19834206Sdyson nextoff = toff + PAGE_SIZE; 19936563Speter m = pages[i]; 20034206Sdyson 20134206Sdyson if (nextoff <= size) { 20245347Sjulian /* 20345347Sjulian * Read operation filled an entire page 20445347Sjulian */ 20534206Sdyson m->valid = VM_PAGE_BITS_ALL; 20649945Salc vm_page_undirty(m); 20745347Sjulian } else if (size > toff) { 20845347Sjulian /* 20946349Salc * Read operation filled a partial page. 21045347Sjulian */ 21146349Salc m->valid = 0; 21245347Sjulian vm_page_set_validclean(m, 0, size - toff); 21346349Salc /* handled by vm_fault now */ 21446349Salc /* vm_page_zero_invalid(m, TRUE); */ 21587834Sdillon } else { 21687834Sdillon /* 21787834Sdillon * Read operation was short. If no error occured 21887834Sdillon * we may have hit a zero-fill section. We simply 21987834Sdillon * leave valid set to 0. 22087834Sdillon */ 22187834Sdillon ; 22234206Sdyson } 22325930Sdfr if (i != ap->a_reqpage) { 22434206Sdyson /* 22534206Sdyson * Whether or not to leave the page activated is up in 22634206Sdyson * the air, but we should put the page on a page queue 22734206Sdyson * somewhere (it already is in the object). Result: 22834206Sdyson * It appears that emperical results show that 22934206Sdyson * deactivating pages is best. 23034206Sdyson */ 23134206Sdyson 23234206Sdyson /* 23334206Sdyson * Just in case someone was asking for this page we 23434206Sdyson * now tell them that it is ok to use. 23534206Sdyson */ 23634206Sdyson if (!error) { 23734206Sdyson if (m->flags & PG_WANTED) 23834206Sdyson vm_page_activate(m); 23934206Sdyson else 24034206Sdyson vm_page_deactivate(m); 24138799Sdfr vm_page_wakeup(m); 24234206Sdyson } else { 24375692Salfred vm_page_free(m); 24434206Sdyson } 24525930Sdfr } 24625930Sdfr } 247100450Salc vm_page_unlock_queues(); 248116461Salc VM_OBJECT_UNLOCK(object); 24925930Sdfr return 0; 25025930Sdfr} 25125930Sdfr 25225930Sdfr/* 25334206Sdyson * Vnode op for VM putpages. 25434096Smsmith */ 25534096Smsmithint 25683651Speternfs_putpages(struct vop_putpages_args *ap) 25734096Smsmith{ 25834206Sdyson struct uio uio; 25934206Sdyson struct iovec iov; 26034206Sdyson vm_offset_t kva; 26134206Sdyson struct buf *bp; 26236563Speter int iomode, must_commit, i, error, npages, count; 26346349Salc off_t offset; 26434206Sdyson int *rtvals; 26536563Speter struct vnode *vp; 26683366Sjulian struct thread *td; 26736563Speter struct ucred *cred; 26836563Speter struct nfsmount *nmp; 26946349Salc struct nfsnode *np; 27036563Speter vm_page_t *pages; 27134206Sdyson 27279224Sdillon GIANT_REQUIRED; 27379224Sdillon 27436563Speter vp = ap->a_vp; 27546349Salc np = VTONFS(vp); 27683366Sjulian td = curthread; /* XXX */ 27791406Sjhb cred = curthread->td_ucred; /* XXX */ 27836563Speter nmp = VFSTONFS(vp->v_mount); 27936563Speter pages = ap->a_m; 28036563Speter count = ap->a_count; 28134206Sdyson rtvals = ap->a_rtvals; 28236563Speter npages = btoc(count); 28346349Salc offset = IDX_TO_OFF(pages[0]->pindex); 28434206Sdyson 28536563Speter if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 28676827Salfred (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { 28783366Sjulian (void)nfs_fsinfo(nmp, vp, cred, td); 28876827Salfred } 28934206Sdyson 290138899Sps if (!nfs_directio_allow_mmap && (np->n_flag & NNONCACHE) && 291138899Sps (vp->v_type == VREG)) 292138899Sps printf("nfs_putpages: called on noncache-able vnode??\n"); 293138899Sps 29483651Speter for (i = 0; i < npages; i++) 29534206Sdyson rtvals[i] = VM_PAGER_AGAIN; 29634206Sdyson 29734206Sdyson /* 29846349Salc * When putting pages, do not extend file past EOF. 29946349Salc */ 30046349Salc 30146349Salc if (offset + count > np->n_size) { 30246349Salc count = np->n_size - offset; 30346349Salc if (count < 0) 30446349Salc count = 0; 30546349Salc } 30646349Salc 30746349Salc /* 30834206Sdyson * We use only the kva address for the buffer, but this is extremely 30934206Sdyson * convienient and fast. 31034206Sdyson */ 31142957Sdillon bp = getpbuf(&nfs_pbuf_freecnt); 31234206Sdyson 31334206Sdyson kva = (vm_offset_t) bp->b_data; 31436563Speter pmap_qenter(kva, pages, npages); 31579247Sjhb cnt.v_vnodeout++; 31679247Sjhb cnt.v_vnodepgsout += count; 31734206Sdyson 31834206Sdyson iov.iov_base = (caddr_t) kva; 31936563Speter iov.iov_len = count; 32034206Sdyson uio.uio_iov = &iov; 32134206Sdyson uio.uio_iovcnt = 1; 32246349Salc uio.uio_offset = offset; 32336563Speter uio.uio_resid = count; 32434206Sdyson uio.uio_segflg = UIO_SYSSPACE; 32534206Sdyson uio.uio_rw = UIO_WRITE; 32683366Sjulian uio.uio_td = td; 32734206Sdyson 32834206Sdyson if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0) 32934206Sdyson iomode = NFSV3WRITE_UNSTABLE; 33034206Sdyson else 33134206Sdyson iomode = NFSV3WRITE_FILESYNC; 33234206Sdyson 333122953Salfred error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, &iomode, &must_commit); 33434206Sdyson 33534206Sdyson pmap_qremove(kva, npages); 33642957Sdillon relpbuf(bp, &nfs_pbuf_freecnt); 33734206Sdyson 33834206Sdyson if (!error) { 33936563Speter int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; 34034206Sdyson for (i = 0; i < nwritten; i++) { 34134206Sdyson rtvals[i] = VM_PAGER_OK; 34249945Salc vm_page_undirty(pages[i]); 34334206Sdyson } 34476827Salfred if (must_commit) { 34536563Speter nfs_clearcommit(vp->v_mount); 34676827Salfred } 34734206Sdyson } 34836563Speter return rtvals[0]; 34934096Smsmith} 35034096Smsmith 35134096Smsmith/* 3521541Srgrimes * Vnode op for read using bio 3531541Srgrimes */ 3541549Srgrimesint 35583651Speternfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred) 3561541Srgrimes{ 35783651Speter struct nfsnode *np = VTONFS(vp); 35883651Speter int biosize, i; 359143822Sdas struct buf *bp, *rabp; 3601541Srgrimes struct vattr vattr; 36183366Sjulian struct thread *td; 3629336Sdfr struct nfsmount *nmp = VFSTONFS(vp->v_mount); 3635455Sdg daddr_t lbn, rabn; 36446349Salc int bcount; 36551344Sdillon int seqcount; 36646349Salc int nra, error = 0, n = 0, on = 0; 3671541Srgrimes 3681541Srgrimes#ifdef DIAGNOSTIC 3691541Srgrimes if (uio->uio_rw != UIO_READ) 3701541Srgrimes panic("nfs_read mode"); 3711541Srgrimes#endif 3721541Srgrimes if (uio->uio_resid == 0) 3731541Srgrimes return (0); 37436473Speter if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */ 3751541Srgrimes return (EINVAL); 37683366Sjulian td = uio->uio_td; 37751344Sdillon 37836176Speter if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 37936176Speter (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) 38083366Sjulian (void)nfs_fsinfo(nmp, vp, cred, td); 38136473Speter if (vp->v_type != VDIR && 38236473Speter (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) 38336473Speter return (EFBIG); 384138899Sps 385138899Sps if (nfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG)) 386138899Sps /* No caching/ no readaheads. Just read data into the user buffer */ 387138899Sps return nfs_readrpc(vp, uio, cred); 388138899Sps 3899428Sdfr biosize = vp->v_mount->mnt_stat.f_iosize; 390108357Sdillon seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE); 3911541Srgrimes /* 3921541Srgrimes * For nfs, cache consistency can only be maintained approximately. 3931541Srgrimes * Although RFC1094 does not specify the criteria, the following is 3941541Srgrimes * believed to be compatible with the reference port. 3951541Srgrimes * For nfs: 3961541Srgrimes * If the file's modify time on the server has changed since the 3971541Srgrimes * last read rpc or you have written to the file, 3981541Srgrimes * you may have lost data cache consistency with the 3991541Srgrimes * server, so flush all of the file's data out of the cache. 4001541Srgrimes * Then force a getattr rpc to ensure that you have up to date 4011541Srgrimes * attributes. 4021541Srgrimes * NB: This implies that cache data can be read when up to 4031541Srgrimes * NFS_ATTRTIMEO seconds out of date. If you find that you need current 4041541Srgrimes * attributes this could be forced by setting n_attrstamp to 0 before 4051541Srgrimes * the VOP_GETATTR() call. 4061541Srgrimes */ 40783651Speter if (np->n_flag & NMODIFIED) { 40883651Speter if (vp->v_type != VREG) { 40983651Speter if (vp->v_type != VDIR) 41083651Speter panic("nfs: bioread, not dir"); 411122953Salfred (nmp->nm_rpcops->nr_invaldir)(vp); 412140731Sphk error = nfs_vinvalbuf(vp, V_SAVE, td, 1); 4133305Sphk if (error) 4141541Srgrimes return (error); 4151541Srgrimes } 41683651Speter np->n_attrstamp = 0; 41783651Speter error = VOP_GETATTR(vp, &vattr, cred, td); 41883651Speter if (error) 4191541Srgrimes return (error); 420138473Sps np->n_mtime = vattr.va_mtime; 42183651Speter } else { 42283651Speter error = VOP_GETATTR(vp, &vattr, cred, td); 42383651Speter if (error) 42483651Speter return (error); 425128263Speadar if ((np->n_flag & NSIZECHANGED) 426138473Sps || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) { 4279336Sdfr if (vp->v_type == VDIR) 428122953Salfred (nmp->nm_rpcops->nr_invaldir)(vp); 429140731Sphk error = nfs_vinvalbuf(vp, V_SAVE, td, 1); 4303305Sphk if (error) 43183651Speter return (error); 432138473Sps np->n_mtime = vattr.va_mtime; 433128263Speadar np->n_flag &= ~NSIZECHANGED; 4341541Srgrimes } 43583651Speter } 43683651Speter do { 4371541Srgrimes switch (vp->v_type) { 4381541Srgrimes case VREG: 4391541Srgrimes nfsstats.biocache_reads++; 4401541Srgrimes lbn = uio->uio_offset / biosize; 4419336Sdfr on = uio->uio_offset & (biosize - 1); 4421541Srgrimes 4431541Srgrimes /* 4441541Srgrimes * Start the read ahead(s), as required. 445138644Sps * The readahead is kicked off only if sequential access 446138644Sps * is detected, based on the readahead hint (ra_expect_lbn). 4471541Srgrimes */ 448138644Sps if (nmp->nm_readahead > 0 && np->ra_expect_lbn == lbn) { 44951344Sdillon for (nra = 0; nra < nmp->nm_readahead && nra < seqcount && 45013612Smpp (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) { 4515455Sdg rabn = lbn + 1 + nra; 452136767Sphk if (incore(&vp->v_bufobj, rabn) == NULL) { 45383366Sjulian rabp = nfs_getcacheblk(vp, rabn, biosize, td); 454131691Salfred if (!rabp) { 455131691Salfred error = nfs_sigintr(nmp, NULL, td); 456131691Salfred return (error ? error : EINTR); 457131691Salfred } 4588692Sdg if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 45958345Sphk rabp->b_flags |= B_ASYNC; 46058345Sphk rabp->b_iocmd = BIO_READ; 4615455Sdg vfs_busy_pages(rabp, 0); 462134898Sphk if (nfs_asyncio(nmp, rabp, cred, td)) { 46358934Sphk rabp->b_flags |= B_INVAL; 46458934Sphk rabp->b_ioflags |= BIO_ERROR; 4655455Sdg vfs_unbusy_pages(rabp); 4661541Srgrimes brelse(rabp); 46755431Sdillon break; 4681541Srgrimes } 46955431Sdillon } else { 4705471Sdg brelse(rabp); 47155431Sdillon } 4721541Srgrimes } 4731541Srgrimes } 474138644Sps np->ra_expect_lbn = lbn + 1; 4751541Srgrimes } 4761541Srgrimes 477148268Sps /* Note that bcount is *not* DEV_BSIZE aligned. */ 47846349Salc bcount = biosize; 47946349Salc if ((off_t)lbn * biosize >= np->n_size) { 48046349Salc bcount = 0; 48146349Salc } else if ((off_t)(lbn + 1) * biosize > np->n_size) { 48246349Salc bcount = np->n_size - (off_t)lbn * biosize; 4838692Sdg } 48483366Sjulian bp = nfs_getcacheblk(vp, lbn, bcount, td); 48554605Sdillon 486131691Salfred if (!bp) { 487131691Salfred error = nfs_sigintr(nmp, NULL, td); 488131691Salfred return (error ? error : EINTR); 489131691Salfred } 49042957Sdillon 49125930Sdfr /* 49246349Salc * If B_CACHE is not set, we must issue the read. If this 49346349Salc * fails, we return an error. 49425930Sdfr */ 49546349Salc 4967871Sdg if ((bp->b_flags & B_CACHE) == 0) { 49758345Sphk bp->b_iocmd = BIO_READ; 49832755Sdyson vfs_busy_pages(bp, 0); 499134898Sphk error = nfs_doio(vp, bp, cred, td); 50032755Sdyson if (error) { 50132755Sdyson brelse(bp); 50232755Sdyson return (error); 50332755Sdyson } 5041541Srgrimes } 50546349Salc 50646349Salc /* 50746349Salc * on is the offset into the current bp. Figure out how many 50846349Salc * bytes we can copy out of the bp. Note that bcount is 50946349Salc * NOT DEV_BSIZE aligned. 51046349Salc * 51146349Salc * Then figure out how many bytes we can copy into the uio. 51246349Salc */ 51346349Salc 51446349Salc n = 0; 51546349Salc if (on < bcount) 51646349Salc n = min((unsigned)(bcount - on), uio->uio_resid); 5171541Srgrimes break; 5181541Srgrimes case VLNK: 5191541Srgrimes nfsstats.biocache_readlinks++; 52083366Sjulian bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td); 521131691Salfred if (!bp) { 522131691Salfred error = nfs_sigintr(nmp, NULL, td); 523131691Salfred return (error ? error : EINTR); 524131691Salfred } 5257871Sdg if ((bp->b_flags & B_CACHE) == 0) { 52658345Sphk bp->b_iocmd = BIO_READ; 52732755Sdyson vfs_busy_pages(bp, 0); 528134898Sphk error = nfs_doio(vp, bp, cred, td); 52932755Sdyson if (error) { 53058934Sphk bp->b_ioflags |= BIO_ERROR; 53132755Sdyson brelse(bp); 53232755Sdyson return (error); 53332755Sdyson } 5341541Srgrimes } 5351541Srgrimes n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); 5361541Srgrimes on = 0; 5371541Srgrimes break; 5381541Srgrimes case VDIR: 5391541Srgrimes nfsstats.biocache_readdirs++; 54024577Sdfr if (np->n_direofoffset 54124577Sdfr && uio->uio_offset >= np->n_direofoffset) { 54224577Sdfr return (0); 54324577Sdfr } 54436979Sbde lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ; 5459336Sdfr on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); 54683366Sjulian bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td); 547131691Salfred if (!bp) { 548131691Salfred error = nfs_sigintr(nmp, NULL, td); 549131691Salfred return (error ? error : EINTR); 550131691Salfred } 5517871Sdg if ((bp->b_flags & B_CACHE) == 0) { 55258345Sphk bp->b_iocmd = BIO_READ; 5539336Sdfr vfs_busy_pages(bp, 0); 554134898Sphk error = nfs_doio(vp, bp, cred, td); 55532912Stegge if (error) { 55632912Stegge brelse(bp); 55732912Stegge } 55832755Sdyson while (error == NFSERR_BAD_COOKIE) { 559122953Salfred (nmp->nm_rpcops->nr_invaldir)(vp); 560140731Sphk error = nfs_vinvalbuf(vp, 0, td, 1); 56132755Sdyson /* 56232755Sdyson * Yuck! The directory has been modified on the 56332755Sdyson * server. The only way to get the block is by 56432755Sdyson * reading from the beginning to get all the 56532755Sdyson * offset cookies. 56646349Salc * 56746349Salc * Leave the last bp intact unless there is an error. 56846349Salc * Loop back up to the while if the error is another 56946349Salc * NFSERR_BAD_COOKIE (double yuch!). 57032755Sdyson */ 57132755Sdyson for (i = 0; i <= lbn && !error; i++) { 57232755Sdyson if (np->n_direofoffset 57332755Sdyson && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) 57424577Sdfr return (0); 57583366Sjulian bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td); 576131691Salfred if (!bp) { 577131691Salfred error = nfs_sigintr(nmp, NULL, td); 578131691Salfred return (error ? error : EINTR); 579131691Salfred } 58046349Salc if ((bp->b_flags & B_CACHE) == 0) { 58158345Sphk bp->b_iocmd = BIO_READ; 58246349Salc vfs_busy_pages(bp, 0); 583134898Sphk error = nfs_doio(vp, bp, cred, td); 58446349Salc /* 58546349Salc * no error + B_INVAL == directory EOF, 58646349Salc * use the block. 58746349Salc */ 58846349Salc if (error == 0 && (bp->b_flags & B_INVAL)) 58946349Salc break; 59046349Salc } 59146349Salc /* 59246349Salc * An error will throw away the block and the 59346349Salc * for loop will break out. If no error and this 59446349Salc * is not the block we want, we throw away the 59546349Salc * block and go for the next one via the for loop. 59646349Salc */ 59746349Salc if (error || i < lbn) 59832755Sdyson brelse(bp); 5991541Srgrimes } 60032912Stegge } 60146349Salc /* 60246349Salc * The above while is repeated if we hit another cookie 60346349Salc * error. If we hit an error and it wasn't a cookie error, 60446349Salc * we give up. 60546349Salc */ 60632912Stegge if (error) 6079336Sdfr return (error); 6081541Srgrimes } 6091541Srgrimes 6101541Srgrimes /* 6111541Srgrimes * If not eof and read aheads are enabled, start one. 6121541Srgrimes * (You need the current block first, so that you have the 6139336Sdfr * directory offset cookie of the next block.) 6141541Srgrimes */ 61589324Speter if (nmp->nm_readahead > 0 && 61639782Smckusick (bp->b_flags & B_INVAL) == 0 && 6179336Sdfr (np->n_direofoffset == 0 || 6189336Sdfr (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && 619136767Sphk incore(&vp->v_bufobj, lbn + 1) == NULL) { 62083366Sjulian rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td); 6211541Srgrimes if (rabp) { 6228692Sdg if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 62358345Sphk rabp->b_flags |= B_ASYNC; 62458345Sphk rabp->b_iocmd = BIO_READ; 6255455Sdg vfs_busy_pages(rabp, 0); 626134898Sphk if (nfs_asyncio(nmp, rabp, cred, td)) { 62758934Sphk rabp->b_flags |= B_INVAL; 62858934Sphk rabp->b_ioflags |= BIO_ERROR; 6295455Sdg vfs_unbusy_pages(rabp); 6301541Srgrimes brelse(rabp); 6311541Srgrimes } 6325471Sdg } else { 6335471Sdg brelse(rabp); 6341541Srgrimes } 6351541Srgrimes } 6361541Srgrimes } 63726469Sdfr /* 63846349Salc * Unlike VREG files, whos buffer size ( bp->b_bcount ) is 63946349Salc * chopped for the EOF condition, we cannot tell how large 64046349Salc * NFS directories are going to be until we hit EOF. So 64146349Salc * an NFS directory buffer is *not* chopped to its EOF. Now, 64246349Salc * it just so happens that b_resid will effectively chop it 64346349Salc * to EOF. *BUT* this information is lost if the buffer goes 64446349Salc * away and is reconstituted into a B_CACHE state ( due to 64546349Salc * being VMIO ) later. So we keep track of the directory eof 64683651Speter * in np->n_direofoffset and chop it off as an extra step 64746349Salc * right here. 64826469Sdfr */ 64926469Sdfr n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); 65046349Salc if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset) 65146349Salc n = np->n_direofoffset - uio->uio_offset; 6521541Srgrimes break; 6533305Sphk default: 65483651Speter printf(" nfs_bioread: type %x unexpected\n", vp->v_type); 655143822Sdas bp = NULL; 6563305Sphk break; 6571541Srgrimes }; 6581541Srgrimes 6591541Srgrimes if (n > 0) { 66034206Sdyson error = uiomove(bp->b_data + on, (int)n, uio); 6611541Srgrimes } 662143822Sdas if (vp->v_type == VLNK) 6631541Srgrimes n = 0; 664143822Sdas if (bp != NULL) 665143822Sdas brelse(bp); 6661541Srgrimes } while (error == 0 && uio->uio_resid > 0 && n > 0); 6671541Srgrimes return (error); 6681541Srgrimes} 6691541Srgrimes 6701541Srgrimes/* 671138899Sps * The NFS write path cannot handle iovecs with len > 1. So we need to 672138899Sps * break up iovecs accordingly (restricting them to wsize). 673138899Sps * For the SYNC case, we can do this with 1 copy (user buffer -> mbuf). 674138899Sps * For the ASYNC case, 2 copies are needed. The first a copy from the 675138899Sps * user buffer to a staging buffer and then a second copy from the staging 676138899Sps * buffer to mbufs. This can be optimized by copying from the user buffer 677138899Sps * directly into mbufs and passing the chain down, but that requires a 678138899Sps * fair amount of re-working of the relevant codepaths (and can be done 679138899Sps * later). 680138899Sps */ 681138899Spsstatic int 682138899Spsnfs_directio_write(vp, uiop, cred, ioflag) 683138899Sps struct vnode *vp; 684138899Sps struct uio *uiop; 685138899Sps struct ucred *cred; 686138899Sps int ioflag; 687138899Sps{ 688138899Sps int error; 689138899Sps struct nfsmount *nmp = VFSTONFS(vp->v_mount); 690138899Sps struct thread *td = uiop->uio_td; 691138899Sps int size; 692138899Sps 693138899Sps if (ioflag & IO_SYNC) { 694138899Sps int iomode, must_commit; 695138899Sps struct uio uio; 696138899Sps struct iovec iov; 697138899Spsdo_sync: 698138899Sps while (uiop->uio_resid > 0) { 699138899Sps size = min(uiop->uio_resid, nmp->nm_wsize); 700138899Sps size = min(uiop->uio_iov->iov_len, size); 701138899Sps iov.iov_base = uiop->uio_iov->iov_base; 702138899Sps iov.iov_len = size; 703138899Sps uio.uio_iov = &iov; 704138899Sps uio.uio_iovcnt = 1; 705138899Sps uio.uio_offset = uiop->uio_offset; 706138899Sps uio.uio_resid = size; 707138899Sps uio.uio_segflg = UIO_USERSPACE; 708138899Sps uio.uio_rw = UIO_WRITE; 709138899Sps uio.uio_td = td; 710138899Sps iomode = NFSV3WRITE_FILESYNC; 711138899Sps error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, 712138899Sps &iomode, &must_commit); 713138899Sps KASSERT((must_commit == 0), 714138899Sps ("nfs_directio_write: Did not commit write")); 715138899Sps if (error) 716138899Sps return (error); 717138899Sps uiop->uio_offset += size; 718138899Sps uiop->uio_resid -= size; 719138899Sps if (uiop->uio_iov->iov_len <= size) { 720138899Sps uiop->uio_iovcnt--; 721138899Sps uiop->uio_iov++; 722138899Sps } else { 723138899Sps uiop->uio_iov->iov_base = 724138899Sps (char *)uiop->uio_iov->iov_base + size; 725138899Sps uiop->uio_iov->iov_len -= size; 726138899Sps } 727138899Sps } 728138899Sps } else { 729138899Sps struct uio *t_uio; 730138899Sps struct iovec *t_iov; 731138899Sps struct buf *bp; 732138899Sps 733138899Sps /* 734138899Sps * Break up the write into blocksize chunks and hand these 735138899Sps * over to nfsiod's for write back. 736138899Sps * Unfortunately, this incurs a copy of the data. Since 737138899Sps * the user could modify the buffer before the write is 738138899Sps * initiated. 739138899Sps * 740138899Sps * The obvious optimization here is that one of the 2 copies 741138899Sps * in the async write path can be eliminated by copying the 742138899Sps * data here directly into mbufs and passing the mbuf chain 743138899Sps * down. But that will require a fair amount of re-working 744138899Sps * of the code and can be done if there's enough interest 745138899Sps * in NFS directio access. 746138899Sps */ 747138899Sps while (uiop->uio_resid > 0) { 748138899Sps size = min(uiop->uio_resid, nmp->nm_wsize); 749138899Sps size = min(uiop->uio_iov->iov_len, size); 750138899Sps bp = getpbuf(&nfs_pbuf_freecnt); 751138899Sps t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK); 752138899Sps t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK); 753138899Sps t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK); 754138899Sps t_iov->iov_len = size; 755138899Sps t_uio->uio_iov = t_iov; 756138899Sps t_uio->uio_iovcnt = 1; 757138899Sps t_uio->uio_offset = uiop->uio_offset; 758138899Sps t_uio->uio_resid = size; 759138899Sps t_uio->uio_segflg = UIO_SYSSPACE; 760138899Sps t_uio->uio_rw = UIO_WRITE; 761138899Sps t_uio->uio_td = td; 762138899Sps bcopy(uiop->uio_iov->iov_base, t_iov->iov_base, size); 763138899Sps bp->b_flags |= B_DIRECT; 764138899Sps bp->b_iocmd = BIO_WRITE; 765138899Sps if (cred != NOCRED) { 766138899Sps crhold(cred); 767138899Sps bp->b_wcred = cred; 768138899Sps } else 769138899Sps bp->b_wcred = NOCRED; 770138899Sps bp->b_caller1 = (void *)t_uio; 771138899Sps bp->b_vp = vp; 772138899Sps vhold(vp); 773138899Sps error = nfs_asyncio(nmp, bp, NOCRED, td); 774138899Sps if (error) { 775138899Sps free(t_iov->iov_base, M_NFSDIRECTIO); 776138899Sps free(t_iov, M_NFSDIRECTIO); 777138899Sps free(t_uio, M_NFSDIRECTIO); 778138899Sps vdrop(bp->b_vp); 779138899Sps bp->b_vp = NULL; 780138899Sps relpbuf(bp, &nfs_pbuf_freecnt); 781138899Sps if (error == EINTR) 782138899Sps return (error); 783138899Sps goto do_sync; 784138899Sps } 785138899Sps uiop->uio_offset += size; 786138899Sps uiop->uio_resid -= size; 787138899Sps if (uiop->uio_iov->iov_len <= size) { 788138899Sps uiop->uio_iovcnt--; 789138899Sps uiop->uio_iov++; 790138899Sps } else { 791138899Sps uiop->uio_iov->iov_base = 792138899Sps (char *)uiop->uio_iov->iov_base + size; 793138899Sps uiop->uio_iov->iov_len -= size; 794138899Sps } 795138899Sps } 796138899Sps } 797138899Sps return (0); 798138899Sps} 799138899Sps 800138899Sps/* 8011541Srgrimes * Vnode op for write using bio 8021541Srgrimes */ 8031549Srgrimesint 80483651Speternfs_write(struct vop_write_args *ap) 8051541Srgrimes{ 80646349Salc int biosize; 80746349Salc struct uio *uio = ap->a_uio; 80883366Sjulian struct thread *td = uio->uio_td; 80946349Salc struct vnode *vp = ap->a_vp; 8101541Srgrimes struct nfsnode *np = VTONFS(vp); 81146349Salc struct ucred *cred = ap->a_cred; 8121541Srgrimes int ioflag = ap->a_ioflag; 8131541Srgrimes struct buf *bp; 8141541Srgrimes struct vattr vattr; 8159336Sdfr struct nfsmount *nmp = VFSTONFS(vp->v_mount); 81611921Sphk daddr_t lbn; 81746349Salc int bcount; 81883651Speter int n, on, error = 0; 81983366Sjulian struct proc *p = td?td->td_proc:NULL; 8201541Srgrimes 82179224Sdillon GIANT_REQUIRED; 82279224Sdillon 8231541Srgrimes#ifdef DIAGNOSTIC 8241541Srgrimes if (uio->uio_rw != UIO_WRITE) 8251541Srgrimes panic("nfs_write mode"); 82683366Sjulian if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread) 8271541Srgrimes panic("nfs_write proc"); 8281541Srgrimes#endif 8291541Srgrimes if (vp->v_type != VREG) 8301541Srgrimes return (EIO); 8311541Srgrimes if (np->n_flag & NWRITEERR) { 8321541Srgrimes np->n_flag &= ~NWRITEERR; 8331541Srgrimes return (np->n_error); 8341541Srgrimes } 83536176Speter if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 83636176Speter (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) 83783366Sjulian (void)nfs_fsinfo(nmp, vp, cred, td); 83854605Sdillon 83954605Sdillon /* 84054605Sdillon * Synchronously flush pending buffers if we are in synchronous 84154605Sdillon * mode or if we are appending. 84254605Sdillon */ 8431541Srgrimes if (ioflag & (IO_APPEND | IO_SYNC)) { 8441541Srgrimes if (np->n_flag & NMODIFIED) { 845147420Sgreen#ifdef notyet /* Needs matching nonblock semantics elsewhere, too. */ 846147280Sgreen /* 847147280Sgreen * Require non-blocking, synchronous writes to 848147280Sgreen * dirty files to inform the program it needs 849147280Sgreen * to fsync(2) explicitly. 850147280Sgreen */ 851147280Sgreen if (ioflag & IO_NDELAY) 852147280Sgreen return (EAGAIN); 853147420Sgreen#endif 854147280Sgreenflush_and_restart: 8551541Srgrimes np->n_attrstamp = 0; 856140731Sphk error = nfs_vinvalbuf(vp, V_SAVE, td, 1); 8573305Sphk if (error) 8581541Srgrimes return (error); 8591541Srgrimes } 8601541Srgrimes } 86154605Sdillon 86254605Sdillon /* 86354605Sdillon * If IO_APPEND then load uio_offset. We restart here if we cannot 86454605Sdillon * get the append lock. 86554605Sdillon */ 86654605Sdillon if (ioflag & IO_APPEND) { 86754605Sdillon np->n_attrstamp = 0; 86883366Sjulian error = VOP_GETATTR(vp, &vattr, cred, td); 86954605Sdillon if (error) 87054605Sdillon return (error); 87154605Sdillon uio->uio_offset = np->n_size; 87254605Sdillon } 87354605Sdillon 8741541Srgrimes if (uio->uio_offset < 0) 8751541Srgrimes return (EINVAL); 87636473Speter if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) 87736473Speter return (EFBIG); 8781541Srgrimes if (uio->uio_resid == 0) 8791541Srgrimes return (0); 88054605Sdillon 881138899Sps if (nfs_directio_enable && (ioflag & IO_DIRECT) && vp->v_type == VREG) 882138899Sps return nfs_directio_write(vp, uio, cred, ioflag); 883138899Sps 8841541Srgrimes /* 8851541Srgrimes * Maybe this should be above the vnode op call, but so long as 8861541Srgrimes * file servers have no limits, i don't think it matters 8871541Srgrimes */ 888125454Sjhb if (p != NULL) { 88973929Sjhb PROC_LOCK(p); 890125454Sjhb if (uio->uio_offset + uio->uio_resid > 891125454Sjhb lim_cur(p, RLIMIT_FSIZE)) { 892125454Sjhb psignal(p, SIGXFSZ); 893125454Sjhb PROC_UNLOCK(p); 894125454Sjhb return (EFBIG); 895125454Sjhb } 89673929Sjhb PROC_UNLOCK(p); 8971541Srgrimes } 89846349Salc 8999428Sdfr biosize = vp->v_mount->mnt_stat.f_iosize; 900147280Sgreen /* 901147280Sgreen * Find all of this file's B_NEEDCOMMIT buffers. If our writes 902147280Sgreen * would exceed the local maximum per-file write commit size when 903147280Sgreen * combined with those, we must decide whether to flush, 904147280Sgreen * go synchronous, or return error. We don't bother checking 905147280Sgreen * IO_UNIT -- we just make all writes atomic anyway, as there's 906147280Sgreen * no point optimizing for something that really won't ever happen. 907147280Sgreen */ 908147280Sgreen if (!(ioflag & IO_SYNC)) { 909147280Sgreen int needrestart = 0; 910147280Sgreen if (nmp->nm_wcommitsize < uio->uio_resid) { 911147280Sgreen /* 912147280Sgreen * If this request could not possibly be completed 913147280Sgreen * without exceeding the maximum outstanding write 914147280Sgreen * commit size, see if we can convert it into a 915147280Sgreen * synchronous write operation. 916147280Sgreen */ 917147280Sgreen if (ioflag & IO_NDELAY) 918147280Sgreen return (EAGAIN); 919147280Sgreen ioflag |= IO_SYNC; 920147280Sgreen if (np->n_flag & NMODIFIED) 921147280Sgreen needrestart = 1; 922147280Sgreen } else if (np->n_flag & NMODIFIED) { 923147280Sgreen int wouldcommit = 0; 924147280Sgreen BO_LOCK(&vp->v_bufobj); 925147280Sgreen if (vp->v_bufobj.bo_dirty.bv_cnt != 0) { 926147280Sgreen TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, 927147280Sgreen b_bobufs) { 928147280Sgreen if (bp->b_flags & B_NEEDCOMMIT) 929147280Sgreen wouldcommit += bp->b_bcount; 930147280Sgreen } 931147280Sgreen } 932147280Sgreen BO_UNLOCK(&vp->v_bufobj); 933147280Sgreen /* 934147280Sgreen * Since we're not operating synchronously and 935147280Sgreen * bypassing the buffer cache, we are in a commit 936147280Sgreen * and holding all of these buffers whether 937147280Sgreen * transmitted or not. If not limited, this 938147280Sgreen * will lead to the buffer cache deadlocking, 939147280Sgreen * as no one else can flush our uncommitted buffers. 940147280Sgreen */ 941147280Sgreen wouldcommit += uio->uio_resid; 942147280Sgreen /* 943147280Sgreen * If we would initially exceed the maximum 944147280Sgreen * outstanding write commit size, flush and restart. 945147280Sgreen */ 946147280Sgreen if (wouldcommit > nmp->nm_wcommitsize) 947147280Sgreen needrestart = 1; 948147280Sgreen } 949148268Sps if (needrestart) 950147280Sgreen goto flush_and_restart; 951147280Sgreen } 95246349Salc 9531541Srgrimes do { 9541541Srgrimes nfsstats.biocache_writes++; 9551541Srgrimes lbn = uio->uio_offset / biosize; 9561541Srgrimes on = uio->uio_offset & (biosize-1); 9571541Srgrimes n = min((unsigned)(biosize - on), uio->uio_resid); 9581541Srgrimesagain: 95946349Salc /* 96046349Salc * Handle direct append and file extension cases, calculate 96146349Salc * unaligned buffer size. 96246349Salc */ 96346349Salc 96446349Salc if (uio->uio_offset == np->n_size && n) { 96546349Salc /* 96654605Sdillon * Get the buffer (in its pre-append state to maintain 96754605Sdillon * B_CACHE if it was previously set). Resize the 96854605Sdillon * nfsnode after we have locked the buffer to prevent 96954605Sdillon * readers from reading garbage. 97046349Salc */ 97146349Salc bcount = on; 97283366Sjulian bp = nfs_getcacheblk(vp, lbn, bcount, td); 97346349Salc 97454605Sdillon if (bp != NULL) { 97554605Sdillon long save; 97646349Salc 97754605Sdillon np->n_size = uio->uio_offset + n; 97854605Sdillon np->n_flag |= NMODIFIED; 97954605Sdillon vnode_pager_setsize(vp, np->n_size); 98054605Sdillon 98154605Sdillon save = bp->b_flags & B_CACHE; 98254605Sdillon bcount += n; 98354605Sdillon allocbuf(bp, bcount); 98454605Sdillon bp->b_flags |= save; 98554605Sdillon } 98646349Salc } else { 98754605Sdillon /* 98883651Speter * Obtain the locked cache block first, and then 98954605Sdillon * adjust the file's size as appropriate. 99054605Sdillon */ 99154605Sdillon bcount = on + n; 99254605Sdillon if ((off_t)lbn * biosize + bcount < np->n_size) { 99354605Sdillon if ((off_t)(lbn + 1) * biosize < np->n_size) 99454605Sdillon bcount = biosize; 99554605Sdillon else 99654605Sdillon bcount = np->n_size - (off_t)lbn * biosize; 99754605Sdillon } 99883366Sjulian bp = nfs_getcacheblk(vp, lbn, bcount, td); 99946349Salc if (uio->uio_offset + n > np->n_size) { 100046349Salc np->n_size = uio->uio_offset + n; 100146349Salc np->n_flag |= NMODIFIED; 100246349Salc vnode_pager_setsize(vp, np->n_size); 100346349Salc } 10048692Sdg } 100546349Salc 100654605Sdillon if (!bp) { 1007131691Salfred error = nfs_sigintr(nmp, NULL, td); 1008131691Salfred if (!error) 1009131691Salfred error = EINTR; 101054605Sdillon break; 101154605Sdillon } 101254605Sdillon 101346349Salc /* 101446349Salc * Issue a READ if B_CACHE is not set. In special-append 101546349Salc * mode, B_CACHE is based on the buffer prior to the write 101646349Salc * op and is typically set, avoiding the read. If a read 101746349Salc * is required in special append mode, the server will 101846349Salc * probably send us a short-read since we extended the file 101983651Speter * on our end, resulting in b_resid == 0 and, thusly, 102046349Salc * B_CACHE getting set. 102146349Salc * 102246349Salc * We can also avoid issuing the read if the write covers 102346349Salc * the entire buffer. We have to make sure the buffer state 102446349Salc * is reasonable in this case since we will not be initiating 102546349Salc * I/O. See the comments in kern/vfs_bio.c's getblk() for 102646349Salc * more information. 102746349Salc * 102846349Salc * B_CACHE may also be set due to the buffer being cached 102946349Salc * normally. 103046349Salc */ 103146349Salc 103246349Salc if (on == 0 && n == bcount) { 103346349Salc bp->b_flags |= B_CACHE; 103458934Sphk bp->b_flags &= ~B_INVAL; 103558934Sphk bp->b_ioflags &= ~BIO_ERROR; 10368692Sdg } 103746349Salc 103846349Salc if ((bp->b_flags & B_CACHE) == 0) { 103958345Sphk bp->b_iocmd = BIO_READ; 104046349Salc vfs_busy_pages(bp, 0); 1041134898Sphk error = nfs_doio(vp, bp, cred, td); 104246349Salc if (error) { 104346349Salc brelse(bp); 104454605Sdillon break; 104546349Salc } 104646349Salc } 104784827Sjhb if (bp->b_wcred == NOCRED) 104884827Sjhb bp->b_wcred = crhold(cred); 10491541Srgrimes np->n_flag |= NMODIFIED; 10508692Sdg 105145347Sjulian /* 105254605Sdillon * If dirtyend exceeds file size, chop it down. This should 105354605Sdillon * not normally occur but there is an append race where it 105483651Speter * might occur XXX, so we log it. 105554605Sdillon * 105654605Sdillon * If the chopping creates a reverse-indexed or degenerate 105754605Sdillon * situation with dirtyoff/end, we 0 both of them. 105845347Sjulian */ 105945347Sjulian 106054605Sdillon if (bp->b_dirtyend > bcount) { 106183651Speter printf("NFS append race @%lx:%d\n", 106283651Speter (long)bp->b_blkno * DEV_BSIZE, 106354605Sdillon bp->b_dirtyend - bcount); 106454605Sdillon bp->b_dirtyend = bcount; 106554605Sdillon } 106654605Sdillon 106745347Sjulian if (bp->b_dirtyoff >= bp->b_dirtyend) 106845347Sjulian bp->b_dirtyoff = bp->b_dirtyend = 0; 106931617Sdyson 10701541Srgrimes /* 107131617Sdyson * If the new write will leave a contiguous dirty 107231617Sdyson * area, just update the b_dirtyoff and b_dirtyend, 107331617Sdyson * otherwise force a write rpc of the old dirty area. 107446349Salc * 107583651Speter * While it is possible to merge discontiguous writes due to 107646349Salc * our having a B_CACHE buffer ( and thus valid read data 107783651Speter * for the hole), we don't because it could lead to 107846349Salc * significant cache coherency problems with multiple clients, 107946349Salc * especially if locking is implemented later on. 108046349Salc * 108146349Salc * as an optimization we could theoretically maintain 108246349Salc * a linked list of discontinuous areas, but we would still 108346349Salc * have to commit them separately so there isn't much 108446349Salc * advantage to it except perhaps a bit of asynchronization. 108531617Sdyson */ 108642957Sdillon 108731617Sdyson if (bp->b_dirtyend > 0 && 108831617Sdyson (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { 1089126853Sphk if (bwrite(bp) == EINTR) { 1090100194Sdillon error = EINTR; 1091100194Sdillon break; 1092100194Sdillon } 109331617Sdyson goto again; 109431617Sdyson } 109531617Sdyson 10963305Sphk error = uiomove((char *)bp->b_data + on, n, uio); 109754480Sdillon 109854480Sdillon /* 109954480Sdillon * Since this block is being modified, it must be written 110054480Sdillon * again and not just committed. Since write clustering does 110154480Sdillon * not work for the stage 1 data write, only the stage 2 110254480Sdillon * commit rpc, we have to clear B_CLUSTEROK as well. 110354480Sdillon */ 110454480Sdillon bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 110554480Sdillon 11063305Sphk if (error) { 110758934Sphk bp->b_ioflags |= BIO_ERROR; 11081541Srgrimes brelse(bp); 110954605Sdillon break; 11101541Srgrimes } 111134206Sdyson 111234206Sdyson /* 111383651Speter * Only update dirtyoff/dirtyend if not a degenerate 111445347Sjulian * condition. 111545347Sjulian */ 111645347Sjulian if (n) { 111745347Sjulian if (bp->b_dirtyend > 0) { 111845347Sjulian bp->b_dirtyoff = min(on, bp->b_dirtyoff); 111945347Sjulian bp->b_dirtyend = max((on + n), bp->b_dirtyend); 112045347Sjulian } else { 112145347Sjulian bp->b_dirtyoff = on; 112245347Sjulian bp->b_dirtyend = on + n; 112345347Sjulian } 112446349Salc vfs_bio_set_validclean(bp, on, n); 11251541Srgrimes } 112645347Sjulian 112744679Sjulian /* 112883651Speter * If IO_SYNC do bwrite(). 112946349Salc * 113046349Salc * IO_INVAL appears to be unused. The idea appears to be 113146349Salc * to turn off caching in this case. Very odd. XXX 11321541Srgrimes */ 113383651Speter if ((ioflag & IO_SYNC)) { 113434206Sdyson if (ioflag & IO_INVAL) 113546349Salc bp->b_flags |= B_NOCACHE; 1136126853Sphk error = bwrite(bp); 11373305Sphk if (error) 113854605Sdillon break; 113983651Speter } else if ((n + on) == biosize) { 11409336Sdfr bp->b_flags |= B_ASYNC; 1141122953Salfred (void) (nmp->nm_rpcops->nr_writebp)(bp, 0, 0); 114246349Salc } else { 11431541Srgrimes bdwrite(bp); 114446349Salc } 11451541Srgrimes } while (uio->uio_resid > 0 && n > 0); 114654605Sdillon 114754605Sdillon return (error); 11481541Srgrimes} 11491541Srgrimes 11501541Srgrimes/* 11511541Srgrimes * Get an nfs cache block. 115254480Sdillon * 11531541Srgrimes * Allocate a new one if the block isn't currently in the cache 11541541Srgrimes * and return the block marked busy. If the calling process is 11551541Srgrimes * interrupted by a signal for an interruptible mount point, return 11561541Srgrimes * NULL. 115754480Sdillon * 115854480Sdillon * The caller must carefully deal with the possible B_INVAL state of 115954480Sdillon * the buffer. nfs_doio() clears B_INVAL (and nfs_asyncio() clears it 116054480Sdillon * indirectly), so synchronous reads can be issued without worrying about 116154480Sdillon * the B_INVAL state. We have to be a little more careful when dealing 116254480Sdillon * with writes (see comments in nfs_write()) when extending a file past 116354480Sdillon * its EOF. 11641541Srgrimes */ 116512911Sphkstatic struct buf * 116683651Speternfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td) 11671541Srgrimes{ 116883651Speter struct buf *bp; 116932755Sdyson struct mount *mp; 117032755Sdyson struct nfsmount *nmp; 11711541Srgrimes 117232755Sdyson mp = vp->v_mount; 117332755Sdyson nmp = VFSTONFS(mp); 117432755Sdyson 11751541Srgrimes if (nmp->nm_flag & NFSMNT_INT) { 1176138496Sps sigset_t oldset; 1177138496Sps 1178138496Sps nfs_set_sigmask(td, &oldset); 1179111856Sjeff bp = getblk(vp, bn, size, PCATCH, 0, 0); 1180138496Sps nfs_restore_sigmask(td, &oldset); 118199797Sdillon while (bp == NULL) { 118299797Sdillon if (nfs_sigintr(nmp, NULL, td)) 118399797Sdillon return (NULL); 1184111856Sjeff bp = getblk(vp, bn, size, 0, 2 * hz, 0); 11851541Srgrimes } 118646349Salc } else { 1187111856Sjeff bp = getblk(vp, bn, size, 0, 0, 0); 118846349Salc } 11895455Sdg 119041791Sdt if (vp->v_type == VREG) { 119132755Sdyson int biosize; 119246349Salc 119332755Sdyson biosize = mp->mnt_stat.f_iosize; 119441791Sdt bp->b_blkno = bn * (biosize / DEV_BSIZE); 119532755Sdyson } 11961541Srgrimes return (bp); 11971541Srgrimes} 11981541Srgrimes 11991541Srgrimes/* 12001541Srgrimes * Flush and invalidate all dirty buffers. If another process is already 12011541Srgrimes * doing the flush, just wait for completion. 12021541Srgrimes */ 12031549Srgrimesint 1204140731Sphknfs_vinvalbuf(struct vnode *vp, int flags, struct thread *td, int intrflg) 12051541Srgrimes{ 120683651Speter struct nfsnode *np = VTONFS(vp); 12071541Srgrimes struct nfsmount *nmp = VFSTONFS(vp->v_mount); 12081541Srgrimes int error = 0, slpflag, slptimeo; 1209138469Sps int old_lock = 0; 12101541Srgrimes 1211115041Srwatson ASSERT_VOP_LOCKED(vp, "nfs_vinvalbuf"); 1212115041Srwatson 1213120730Sjeff /* 1214120730Sjeff * XXX This check stops us from needlessly doing a vinvalbuf when 1215120730Sjeff * being called through vclean(). It is not clear that this is 1216120730Sjeff * unsafe. 1217120730Sjeff */ 1218143510Sjeff if (vp->v_iflag & VI_DOOMED) 121932755Sdyson return (0); 122032755Sdyson 12211541Srgrimes if ((nmp->nm_flag & NFSMNT_INT) == 0) 12221541Srgrimes intrflg = 0; 12231541Srgrimes if (intrflg) { 12241541Srgrimes slpflag = PCATCH; 12251541Srgrimes slptimeo = 2 * hz; 12261541Srgrimes } else { 12271541Srgrimes slpflag = 0; 12281541Srgrimes slptimeo = 0; 12291541Srgrimes } 12301541Srgrimes 1231138469Sps if ((old_lock = VOP_ISLOCKED(vp, td)) != LK_EXCLUSIVE) { 1232138469Sps if (old_lock == LK_SHARED) { 1233138469Sps /* Upgrade to exclusive lock, this might block */ 1234138469Sps vn_lock(vp, LK_UPGRADE | LK_RETRY, td); 1235138469Sps } else { 1236138469Sps vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 1237138469Sps } 1238138469Sps } 1239138469Sps 12401541Srgrimes /* 12411541Srgrimes * Now, flush as required. 12421541Srgrimes */ 1243140220Sphk error = vinvalbuf(vp, flags, td, slpflag, 0); 12441541Srgrimes while (error) { 1245138469Sps if (intrflg && (error = nfs_sigintr(nmp, NULL, td))) 1246138469Sps goto out; 1247140220Sphk error = vinvalbuf(vp, flags, td, 0, slptimeo); 12481541Srgrimes } 1249138469Sps np->n_flag &= ~NMODIFIED; 1250138469Spsout: 1251138469Sps if (old_lock != LK_EXCLUSIVE) { 1252138469Sps if (old_lock == LK_SHARED) { 1253138469Sps /* Downgrade from exclusive lock, this might block */ 1254138469Sps vn_lock(vp, LK_DOWNGRADE, td); 1255138469Sps } else { 1256138469Sps VOP_UNLOCK(vp, 0, td); 1257138469Sps } 1258138469Sps } 1259138469Sps return error; 12601541Srgrimes} 12611541Srgrimes 12621541Srgrimes/* 12631541Srgrimes * Initiate asynchronous I/O. Return an error if no nfsiods are available. 12641541Srgrimes * This is mainly to avoid queueing async I/O requests when the nfsiods 12651541Srgrimes * are all hung on a dead server. 126646349Salc * 126758934Sphk * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp 126846349Salc * is eventually dequeued by the async daemon, nfs_doio() *will*. 12691541Srgrimes */ 12701549Srgrimesint 1271134898Sphknfs_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thread *td) 12721541Srgrimes{ 127389324Speter int iod; 127419449Sdfr int gotiod; 127519449Sdfr int slpflag = 0; 127619449Sdfr int slptimeo = 0; 1277131691Salfred int error, error2; 12781541Srgrimes 127955431Sdillon /* 128083651Speter * Commits are usually short and sweet so lets save some cpu and 128155431Sdillon * leave the async daemons for more important rpc's (such as reads 128255431Sdillon * and writes). 128355431Sdillon */ 128458345Sphk if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) && 128555431Sdillon (nmp->nm_bufqiods > nfs_numasync / 2)) { 128655431Sdillon return(EIO); 128755431Sdillon } 128855431Sdillon 128919449Sdfragain: 129019449Sdfr if (nmp->nm_flag & NFSMNT_INT) 129119449Sdfr slpflag = PCATCH; 129219449Sdfr gotiod = FALSE; 129319449Sdfr 129419449Sdfr /* 129519449Sdfr * Find a free iod to process this request. 129619449Sdfr */ 129789407Speter for (iod = 0; iod < nfs_numasync; iod++) 129889324Speter if (nfs_iodwant[iod]) { 129919449Sdfr gotiod = TRUE; 130025023Sdfr break; 130119449Sdfr } 130219449Sdfr 130319449Sdfr /* 130489324Speter * Try to create one if none are free. 130589324Speter */ 130689324Speter if (!gotiod) { 130789324Speter iod = nfs_nfsiodnew(); 130889324Speter if (iod != -1) 130989324Speter gotiod = TRUE; 131089324Speter } 131189324Speter 131289407Speter if (gotiod) { 131389407Speter /* 131489407Speter * Found one, so wake it up and tell it which 131589407Speter * mount to process. 131689407Speter */ 131789407Speter NFS_DPF(ASYNCIO, ("nfs_asyncio: waking iod %d for mount %p\n", 131889407Speter iod, nmp)); 131999797Sdillon nfs_iodwant[iod] = NULL; 132089407Speter nfs_iodmount[iod] = nmp; 132189407Speter nmp->nm_bufqiods++; 1322111748Sdes wakeup(&nfs_iodwant[iod]); 132389407Speter } 132489407Speter 132589324Speter /* 132619449Sdfr * If none are free, we may already have an iod working on this mount 132719449Sdfr * point. If so, it will process our request. 132819449Sdfr */ 132919449Sdfr if (!gotiod) { 133019449Sdfr if (nmp->nm_bufqiods > 0) { 133119449Sdfr NFS_DPF(ASYNCIO, 133219449Sdfr ("nfs_asyncio: %d iods are already processing mount %p\n", 133319449Sdfr nmp->nm_bufqiods, nmp)); 133419449Sdfr gotiod = TRUE; 133519449Sdfr } 133619449Sdfr } 133719449Sdfr 133819449Sdfr /* 133919449Sdfr * If we have an iod which can process the request, then queue 134019449Sdfr * the buffer. 134119449Sdfr */ 134219449Sdfr if (gotiod) { 134319449Sdfr /* 134455431Sdillon * Ensure that the queue never grows too large. We still want 134555431Sdillon * to asynchronize so we block rather then return EIO. 134619449Sdfr */ 134719449Sdfr while (nmp->nm_bufqlen >= 2*nfs_numasync) { 134819449Sdfr NFS_DPF(ASYNCIO, 134919449Sdfr ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp)); 135019449Sdfr nmp->nm_bufqwant = TRUE; 1351138496Sps error = nfs_tsleep(td, &nmp->nm_bufq, slpflag | PRIBIO, 1352138496Sps "nfsaio", slptimeo); 135319449Sdfr if (error) { 1354131691Salfred error2 = nfs_sigintr(nmp, NULL, td); 1355131691Salfred if (error2) 1356131691Salfred return (error2); 135719449Sdfr if (slpflag == PCATCH) { 135819449Sdfr slpflag = 0; 135919449Sdfr slptimeo = 2 * hz; 136019449Sdfr } 136119449Sdfr } 136219449Sdfr /* 136319449Sdfr * We might have lost our iod while sleeping, 136419449Sdfr * so check and loop if nescessary. 136519449Sdfr */ 136619449Sdfr if (nmp->nm_bufqiods == 0) { 136719449Sdfr NFS_DPF(ASYNCIO, 136819449Sdfr ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp)); 136919449Sdfr goto again; 137019449Sdfr } 137119449Sdfr } 137219449Sdfr 137358345Sphk if (bp->b_iocmd == BIO_READ) { 137484827Sjhb if (bp->b_rcred == NOCRED && cred != NOCRED) 137584827Sjhb bp->b_rcred = crhold(cred); 13761541Srgrimes } else { 137784827Sjhb if (bp->b_wcred == NOCRED && cred != NOCRED) 137884827Sjhb bp->b_wcred = crhold(cred); 13791541Srgrimes } 13808876Srgrimes 1381137846Sjeff if (bp->b_flags & B_REMFREE) 1382137846Sjeff bremfreef(bp); 138348225Smckusick BUF_KERNPROC(bp); 138419449Sdfr TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist); 138519449Sdfr nmp->nm_bufqlen++; 13861541Srgrimes return (0); 138719449Sdfr } 13889336Sdfr 13899336Sdfr /* 139019449Sdfr * All the iods are busy on other mounts, so return EIO to 139119449Sdfr * force the caller to process the i/o synchronously. 13929336Sdfr */ 139319449Sdfr NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n")); 139419449Sdfr return (EIO); 13951541Srgrimes} 13961541Srgrimes 1397138899Spsvoid 1398138899Spsnfs_doio_directwrite(struct buf *bp) 1399138899Sps{ 1400138899Sps int iomode, must_commit; 1401138899Sps struct uio *uiop = (struct uio *)bp->b_caller1; 1402138899Sps char *iov_base = uiop->uio_iov->iov_base; 1403138899Sps struct nfsmount *nmp = VFSTONFS(bp->b_vp->v_mount); 1404138899Sps 1405138899Sps iomode = NFSV3WRITE_FILESYNC; 1406138899Sps uiop->uio_td = NULL; /* NULL since we're in nfsiod */ 1407138899Sps (nmp->nm_rpcops->nr_writerpc)(bp->b_vp, uiop, bp->b_wcred, &iomode, &must_commit); 1408138899Sps KASSERT((must_commit == 0), ("nfs_doio_directwrite: Did not commit write")); 1409138899Sps free(iov_base, M_NFSDIRECTIO); 1410138899Sps free(uiop->uio_iov, M_NFSDIRECTIO); 1411138899Sps free(uiop, M_NFSDIRECTIO); 1412138899Sps vdrop(bp->b_vp); 1413138899Sps bp->b_vp = NULL; 1414138899Sps relpbuf(bp, &nfs_pbuf_freecnt); 1415138899Sps} 1416138899Sps 14171541Srgrimes/* 14181541Srgrimes * Do an I/O operation to/from a cache block. This may be called 14191541Srgrimes * synchronously or from an nfsiod. 14201541Srgrimes */ 14211541Srgrimesint 1422134898Sphknfs_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td) 14231541Srgrimes{ 142444679Sjulian struct uio *uiop; 14251541Srgrimes struct nfsnode *np; 14261541Srgrimes struct nfsmount *nmp; 142746349Salc int error = 0, iomode, must_commit = 0; 14281541Srgrimes struct uio uio; 14291541Srgrimes struct iovec io; 143083651Speter struct proc *p = td ? td->td_proc : NULL; 14311541Srgrimes 14321541Srgrimes np = VTONFS(vp); 14331541Srgrimes nmp = VFSTONFS(vp->v_mount); 14341541Srgrimes uiop = &uio; 14351541Srgrimes uiop->uio_iov = &io; 14361541Srgrimes uiop->uio_iovcnt = 1; 14371541Srgrimes uiop->uio_segflg = UIO_SYSSPACE; 143883366Sjulian uiop->uio_td = td; 14391541Srgrimes 144046349Salc /* 144158934Sphk * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We 144246349Salc * do this here so we do not have to do it in all the code that 144346349Salc * calls us. 144446349Salc */ 144558934Sphk bp->b_flags &= ~B_INVAL; 144658934Sphk bp->b_ioflags &= ~BIO_ERROR; 144746349Salc 144844679Sjulian KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp)); 144944679Sjulian 1450121191Sphk if (bp->b_iocmd == BIO_READ) { 14513664Sphk io.iov_len = uiop->uio_resid = bp->b_bcount; 14523664Sphk io.iov_base = bp->b_data; 14531541Srgrimes uiop->uio_rw = UIO_READ; 145487834Sdillon 14551541Srgrimes switch (vp->v_type) { 14561541Srgrimes case VREG: 14579336Sdfr uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; 14581541Srgrimes nfsstats.read_bios++; 1459122953Salfred error = (nmp->nm_rpcops->nr_readrpc)(vp, uiop, cr); 146087834Sdillon 14611541Srgrimes if (!error) { 14621541Srgrimes if (uiop->uio_resid) { 14631541Srgrimes /* 146446349Salc * If we had a short read with no error, we must have 146546349Salc * hit a file hole. We should zero-fill the remainder. 146646349Salc * This can also occur if the server hits the file EOF. 146746349Salc * 146883651Speter * Holes used to be able to occur due to pending 146946349Salc * writes, but that is not possible any longer. 14701541Srgrimes */ 147146349Salc int nread = bp->b_bcount - uiop->uio_resid; 147287834Sdillon int left = uiop->uio_resid; 147346349Salc 147446349Salc if (left > 0) 147546349Salc bzero((char *)bp->b_data + nread, left); 147646349Salc uiop->uio_resid = 0; 147746349Salc } 14781541Srgrimes } 1479115041Srwatson /* ASSERT_VOP_LOCKED(vp, "nfs_doio"); */ 1480101308Sjeff if (p && (vp->v_vflag & VV_TEXT) && 1481138473Sps (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.va_mtime))) { 148273929Sjhb PROC_LOCK(p); 1483136006Sdas killproc(p, "text file modification"); 148473929Sjhb PROC_UNLOCK(p); 14851541Srgrimes } 14861541Srgrimes break; 14871541Srgrimes case VLNK: 14889336Sdfr uiop->uio_offset = (off_t)0; 14891541Srgrimes nfsstats.readlink_bios++; 1490122953Salfred error = (nmp->nm_rpcops->nr_readlinkrpc)(vp, uiop, cr); 14911541Srgrimes break; 14921541Srgrimes case VDIR: 14931541Srgrimes nfsstats.readdir_bios++; 14949336Sdfr uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ; 1495122698Salfred if ((nmp->nm_flag & NFSMNT_NFSV4) != 0) 1496122698Salfred error = nfs4_readdirrpc(vp, uiop, cr); 1497122698Salfred else { 1498122698Salfred if ((nmp->nm_flag & NFSMNT_RDIRPLUS) != 0) { 1499122698Salfred error = nfs_readdirplusrpc(vp, uiop, cr); 1500122698Salfred if (error == NFSERR_NOTSUPP) 1501122698Salfred nmp->nm_flag &= ~NFSMNT_RDIRPLUS; 1502122698Salfred } 1503122698Salfred if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) 1504122698Salfred error = nfs_readdirrpc(vp, uiop, cr); 15059336Sdfr } 150646349Salc /* 150746349Salc * end-of-directory sets B_INVAL but does not generate an 150846349Salc * error. 150946349Salc */ 151039782Smckusick if (error == 0 && uiop->uio_resid == bp->b_bcount) 151139782Smckusick bp->b_flags |= B_INVAL; 15121541Srgrimes break; 15133305Sphk default: 151483651Speter printf("nfs_doio: type %x unexpected\n", vp->v_type); 15153305Sphk break; 15161541Srgrimes }; 15171541Srgrimes if (error) { 151858934Sphk bp->b_ioflags |= BIO_ERROR; 15191541Srgrimes bp->b_error = error; 15201541Srgrimes } 15211541Srgrimes } else { 152283651Speter /* 152351344Sdillon * If we only need to commit, try to commit 152451344Sdillon */ 152551344Sdillon if (bp->b_flags & B_NEEDCOMMIT) { 152651344Sdillon int retv; 152751344Sdillon off_t off; 152851344Sdillon 152951344Sdillon off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; 1530122953Salfred retv = (nmp->nm_rpcops->nr_commit)( 1531136927Sphk vp, off, bp->b_dirtyend-bp->b_dirtyoff, 153283366Sjulian bp->b_wcred, td); 153351344Sdillon if (retv == 0) { 153451344Sdillon bp->b_dirtyoff = bp->b_dirtyend = 0; 153554480Sdillon bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 153651344Sdillon bp->b_resid = 0; 153759249Sphk bufdone(bp); 153851344Sdillon return (0); 153951344Sdillon } 154051344Sdillon if (retv == NFSERR_STALEWRITEVERF) { 1541136927Sphk nfs_clearcommit(vp->v_mount); 154251344Sdillon } 154351344Sdillon } 154451344Sdillon 154551344Sdillon /* 154651344Sdillon * Setup for actual write 154751344Sdillon */ 154851344Sdillon 154941791Sdt if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size) 155041791Sdt bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE; 15518692Sdg 15528692Sdg if (bp->b_dirtyend > bp->b_dirtyoff) { 15538692Sdg io.iov_len = uiop->uio_resid = bp->b_dirtyend 15549336Sdfr - bp->b_dirtyoff; 155541791Sdt uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE 15569336Sdfr + bp->b_dirtyoff; 15578692Sdg io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; 15588692Sdg uiop->uio_rw = UIO_WRITE; 15598692Sdg nfsstats.write_bios++; 156044679Sjulian 156125785Sdfr if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC) 15629336Sdfr iomode = NFSV3WRITE_UNSTABLE; 15638692Sdg else 15649336Sdfr iomode = NFSV3WRITE_FILESYNC; 156544679Sjulian 1566122953Salfred error = (nmp->nm_rpcops->nr_writerpc)(vp, uiop, cr, &iomode, &must_commit); 156751475Sdillon 156851475Sdillon /* 156951475Sdillon * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try 157051475Sdillon * to cluster the buffers needing commit. This will allow 157151475Sdillon * the system to submit a single commit rpc for the whole 157283651Speter * cluster. We can do this even if the buffer is not 100% 157354480Sdillon * dirty (relative to the NFS blocksize), so we optimize the 157454480Sdillon * append-to-file-case. 157554480Sdillon * 157654480Sdillon * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be 157754480Sdillon * cleared because write clustering only works for commit 157854480Sdillon * rpc's, not for the data portion of the write). 157951475Sdillon */ 158051475Sdillon 158125003Sdfr if (!error && iomode == NFSV3WRITE_UNSTABLE) { 158225003Sdfr bp->b_flags |= B_NEEDCOMMIT; 158325003Sdfr if (bp->b_dirtyoff == 0 158446349Salc && bp->b_dirtyend == bp->b_bcount) 158525003Sdfr bp->b_flags |= B_CLUSTEROK; 158644679Sjulian } else { 158754480Sdillon bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 158844679Sjulian } 15898692Sdg 15909336Sdfr /* 15919336Sdfr * For an interrupted write, the buffer is still valid 15929336Sdfr * and the write hasn't been pushed to the server yet, 159358934Sphk * so we can't set BIO_ERROR and report the interruption 15949336Sdfr * by setting B_EINTR. For the B_ASYNC case, B_EINTR 15959336Sdfr * is not relevant, so the rpc attempt is essentially 15969336Sdfr * a noop. For the case of a V3 write rpc not being 15979336Sdfr * committed to stable storage, the block is still 15989336Sdfr * dirty and requires either a commit rpc or another 15999336Sdfr * write rpc with iomode == NFSV3WRITE_FILESYNC before 16009336Sdfr * the block is reused. This is indicated by setting 16019336Sdfr * the B_DELWRI and B_NEEDCOMMIT flags. 160242957Sdillon * 160342957Sdillon * If the buffer is marked B_PAGING, it does not reside on 160444679Sjulian * the vp's paging queues so we cannot call bdirty(). The 160544679Sjulian * bp in this case is not an NFS cache block so we should 160644679Sjulian * be safe. XXX 16079336Sdfr */ 1608131691Salfred if (error == EINTR || error == EIO 16099336Sdfr || (!error && (bp->b_flags & B_NEEDCOMMIT))) { 161034266Sjulian int s; 161134266Sjulian 161244679Sjulian s = splbio(); 16138692Sdg bp->b_flags &= ~(B_INVAL|B_NOCACHE); 161442957Sdillon if ((bp->b_flags & B_PAGING) == 0) { 161544679Sjulian bdirty(bp); 161644679Sjulian bp->b_flags &= ~B_DONE; 161742957Sdillon } 161847749Speter if (error && (bp->b_flags & B_ASYNC) == 0) 161932755Sdyson bp->b_flags |= B_EINTR; 162044679Sjulian splx(s); 16218692Sdg } else { 162244679Sjulian if (error) { 162358934Sphk bp->b_ioflags |= BIO_ERROR; 162444679Sjulian bp->b_error = np->n_error = error; 162544679Sjulian np->n_flag |= NWRITEERR; 162644679Sjulian } 162744679Sjulian bp->b_dirtyoff = bp->b_dirtyend = 0; 16288692Sdg } 16291541Srgrimes } else { 16308692Sdg bp->b_resid = 0; 163159249Sphk bufdone(bp); 16328692Sdg return (0); 16331541Srgrimes } 16341541Srgrimes } 16351541Srgrimes bp->b_resid = uiop->uio_resid; 16369336Sdfr if (must_commit) 163744679Sjulian nfs_clearcommit(vp->v_mount); 163859249Sphk bufdone(bp); 16391541Srgrimes return (error); 16401541Srgrimes} 164187834Sdillon 164287834Sdillon/* 164387834Sdillon * Used to aid in handling ftruncate() operations on the NFS client side. 164487834Sdillon * Truncation creates a number of special problems for NFS. We have to 164587834Sdillon * throw away VM pages and buffer cache buffers that are beyond EOF, and 164687834Sdillon * we have to properly handle VM pages or (potentially dirty) buffers 164787834Sdillon * that straddle the truncation point. 164887834Sdillon */ 164987834Sdillon 165087834Sdillonint 165187834Sdillonnfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize) 165287834Sdillon{ 165387834Sdillon struct nfsnode *np = VTONFS(vp); 165487834Sdillon u_quad_t tsize = np->n_size; 165587834Sdillon int biosize = vp->v_mount->mnt_stat.f_iosize; 165687834Sdillon int error = 0; 165787834Sdillon 165887834Sdillon np->n_size = nsize; 165987834Sdillon 166087834Sdillon if (np->n_size < tsize) { 166187834Sdillon struct buf *bp; 166287834Sdillon daddr_t lbn; 166387834Sdillon int bufsize; 166487834Sdillon 166587834Sdillon /* 166687834Sdillon * vtruncbuf() doesn't get the buffer overlapping the 166787834Sdillon * truncation point. We may have a B_DELWRI and/or B_CACHE 166887834Sdillon * buffer that now needs to be truncated. 166987834Sdillon */ 167087834Sdillon error = vtruncbuf(vp, cred, td, nsize, biosize); 167187834Sdillon lbn = nsize / biosize; 167287834Sdillon bufsize = nsize & (biosize - 1); 167387834Sdillon bp = nfs_getcacheblk(vp, lbn, bufsize, td); 1674138496Sps if (!bp) 1675138496Sps return EINTR; 167687834Sdillon if (bp->b_dirtyoff > bp->b_bcount) 167787834Sdillon bp->b_dirtyoff = bp->b_bcount; 167887834Sdillon if (bp->b_dirtyend > bp->b_bcount) 167987834Sdillon bp->b_dirtyend = bp->b_bcount; 168087834Sdillon bp->b_flags |= B_RELBUF; /* don't leave garbage around */ 168187834Sdillon brelse(bp); 168287834Sdillon } else { 168387834Sdillon vnode_pager_setsize(vp, nsize); 168487834Sdillon } 168587834Sdillon return(error); 168687834Sdillon} 168787834Sdillon 1688