nfs_bio.c revision 147420
1139823Simp/*- 21541Srgrimes * Copyright (c) 1989, 1993 31541Srgrimes * The Regents of the University of California. All rights reserved. 41541Srgrimes * 51541Srgrimes * This code is derived from software contributed to Berkeley by 61541Srgrimes * Rick Macklem at The University of Guelph. 71541Srgrimes * 81541Srgrimes * Redistribution and use in source and binary forms, with or without 91541Srgrimes * modification, are permitted provided that the following conditions 101541Srgrimes * are met: 111541Srgrimes * 1. Redistributions of source code must retain the above copyright 121541Srgrimes * notice, this list of conditions and the following disclaimer. 131541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 141541Srgrimes * notice, this list of conditions and the following disclaimer in the 151541Srgrimes * documentation and/or other materials provided with the distribution. 161541Srgrimes * 4. Neither the name of the University nor the names of its contributors 171541Srgrimes * may be used to endorse or promote products derived from this software 181541Srgrimes * without specific prior written permission. 191541Srgrimes * 201541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 211541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 221541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 231541Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 241541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 251541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 261541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 271541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 281541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 291541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 301541Srgrimes * SUCH DAMAGE. 311541Srgrimes * 3222521Sdyson * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 331541Srgrimes */ 341541Srgrimes 3583651Speter#include <sys/cdefs.h> 3683654Speter__FBSDID("$FreeBSD: head/sys/nfsclient/nfs_bio.c 147420 2005-06-16 15:43:17Z green $"); 3722521Sdyson 381541Srgrimes#include <sys/param.h> 391541Srgrimes#include <sys/systm.h> 4079247Sjhb#include <sys/bio.h> 4179247Sjhb#include <sys/buf.h> 4279247Sjhb#include <sys/kernel.h> 4379247Sjhb#include <sys/mount.h> 4479247Sjhb#include <sys/proc.h> 451541Srgrimes#include <sys/resourcevar.h> 463305Sphk#include <sys/signalvar.h> 4779247Sjhb#include <sys/vmmeter.h> 481541Srgrimes#include <sys/vnode.h> 491541Srgrimes 501541Srgrimes#include <vm/vm.h> 5112662Sdg#include <vm/vm_extern.h> 5225930Sdfr#include <vm/vm_page.h> 5325930Sdfr#include <vm/vm_object.h> 5425930Sdfr#include <vm/vm_pager.h> 5525930Sdfr#include <vm/vnode_pager.h> 561541Srgrimes 57122698Salfred#include <rpc/rpcclnt.h> 58122698Salfred 591541Srgrimes#include <nfs/rpcv2.h> 609336Sdfr#include <nfs/nfsproto.h> 6183651Speter#include <nfsclient/nfs.h> 6283651Speter#include <nfsclient/nfsmount.h> 6383651Speter#include <nfsclient/nfsnode.h> 641541Srgrimes 65122698Salfred#include <nfs4client/nfs4.h> 66122698Salfred 6783651Speterstatic struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, 6883651Speter struct thread *td); 69138899Spsstatic int nfs_directio_write(struct vnode *vp, struct uio *uiop, 70138899Sps struct ucred *cred, int ioflag); 7175580Sphk 72138899Spsextern int nfs_directio_enable; 73138899Spsextern int nfs_directio_allow_mmap; 741541Srgrimes/* 7525930Sdfr * Vnode op for VM getpages. 7625930Sdfr */ 7725930Sdfrint 7883651Speternfs_getpages(struct vop_getpages_args *ap) 7925930Sdfr{ 8046349Salc int i, error, nextoff, size, toff, count, npages; 8132755Sdyson struct uio uio; 8232755Sdyson struct iovec iov; 8332755Sdyson vm_offset_t kva; 8434206Sdyson struct buf *bp; 8536563Speter struct vnode *vp; 8683366Sjulian struct thread *td; 8736563Speter struct ucred *cred; 8836563Speter struct nfsmount *nmp; 89116461Salc vm_object_t object; 9036563Speter vm_page_t *pages; 91138899Sps struct nfsnode *np; 9225930Sdfr 9379224Sdillon GIANT_REQUIRED; 9479224Sdillon 9536563Speter vp = ap->a_vp; 96138899Sps np = VTONFS(vp); 9783366Sjulian td = curthread; /* XXX */ 9891406Sjhb cred = curthread->td_ucred; /* XXX */ 9936563Speter nmp = VFSTONFS(vp->v_mount); 10036563Speter pages = ap->a_m; 10136563Speter count = ap->a_count; 10236563Speter 103116461Salc if ((object = vp->v_object) == NULL) { 10432286Sdyson printf("nfs_getpages: called with non-merged cache vnode??\n"); 10536563Speter return VM_PAGER_ERROR; 10625930Sdfr } 10725930Sdfr 108138899Sps if (!nfs_directio_allow_mmap && (np->n_flag & NNONCACHE) && 109138899Sps (vp->v_type == VREG)) { 110138899Sps printf("nfs_getpages: called on non-cacheable vnode??\n"); 111138899Sps return VM_PAGER_ERROR; 112138899Sps } 113138899Sps 11436563Speter if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 11576827Salfred (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { 116122698Salfred /* We'll never get here for v4, because we always have fsinfo */ 11783366Sjulian (void)nfs_fsinfo(nmp, vp, cred, td); 11876827Salfred } 11946349Salc 12046349Salc npages = btoc(count); 12146349Salc 12234206Sdyson /* 12346349Salc * If the requested page is partially valid, just return it and 12446349Salc * allow the pager to zero-out the blanks. Partially valid pages 12546349Salc * can only occur at the file EOF. 12646349Salc */ 12746349Salc 12846349Salc { 12946349Salc vm_page_t m = pages[ap->a_reqpage]; 13046349Salc 131116461Salc VM_OBJECT_LOCK(object); 132100450Salc vm_page_lock_queues(); 13346349Salc if (m->valid != 0) { 13446349Salc /* handled by vm_fault now */ 13546349Salc /* vm_page_zero_invalid(m, TRUE); */ 13646349Salc for (i = 0; i < npages; ++i) { 13746349Salc if (i != ap->a_reqpage) 13875692Salfred vm_page_free(pages[i]); 13946349Salc } 140100450Salc vm_page_unlock_queues(); 141116461Salc VM_OBJECT_UNLOCK(object); 14246349Salc return(0); 14346349Salc } 144100450Salc vm_page_unlock_queues(); 145116461Salc VM_OBJECT_UNLOCK(object); 14646349Salc } 14746349Salc 14846349Salc /* 14934206Sdyson * We use only the kva address for the buffer, but this is extremely 15034206Sdyson * convienient and fast. 15134206Sdyson */ 15242957Sdillon bp = getpbuf(&nfs_pbuf_freecnt); 15325930Sdfr 15434206Sdyson kva = (vm_offset_t) bp->b_data; 15536563Speter pmap_qenter(kva, pages, npages); 15679247Sjhb cnt.v_vnodein++; 15779247Sjhb cnt.v_vnodepgsin += npages; 15834206Sdyson 15932755Sdyson iov.iov_base = (caddr_t) kva; 16036563Speter iov.iov_len = count; 16132755Sdyson uio.uio_iov = &iov; 16232755Sdyson uio.uio_iovcnt = 1; 16336563Speter uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); 16436563Speter uio.uio_resid = count; 16532755Sdyson uio.uio_segflg = UIO_SYSSPACE; 16632755Sdyson uio.uio_rw = UIO_READ; 16783366Sjulian uio.uio_td = td; 16825930Sdfr 169122953Salfred error = (nmp->nm_rpcops->nr_readrpc)(vp, &uio, cred); 17034206Sdyson pmap_qremove(kva, npages); 17132755Sdyson 17242957Sdillon relpbuf(bp, &nfs_pbuf_freecnt); 17334206Sdyson 17442957Sdillon if (error && (uio.uio_resid == count)) { 17542957Sdillon printf("nfs_getpages: error %d\n", error); 176116461Salc VM_OBJECT_LOCK(object); 177100450Salc vm_page_lock_queues(); 17842957Sdillon for (i = 0; i < npages; ++i) { 17942957Sdillon if (i != ap->a_reqpage) 18075692Salfred vm_page_free(pages[i]); 18142957Sdillon } 182100450Salc vm_page_unlock_queues(); 183116461Salc VM_OBJECT_UNLOCK(object); 18434206Sdyson return VM_PAGER_ERROR; 18542957Sdillon } 18634206Sdyson 18745347Sjulian /* 18845347Sjulian * Calculate the number of bytes read and validate only that number 18945347Sjulian * of bytes. Note that due to pending writes, size may be 0. This 19045347Sjulian * does not mean that the remaining data is invalid! 19145347Sjulian */ 19245347Sjulian 19336563Speter size = count - uio.uio_resid; 194116461Salc VM_OBJECT_LOCK(object); 195100450Salc vm_page_lock_queues(); 19634206Sdyson for (i = 0, toff = 0; i < npages; i++, toff = nextoff) { 19734206Sdyson vm_page_t m; 19834206Sdyson nextoff = toff + PAGE_SIZE; 19936563Speter m = pages[i]; 20034206Sdyson 20134206Sdyson if (nextoff <= size) { 20245347Sjulian /* 20345347Sjulian * Read operation filled an entire page 20445347Sjulian */ 20534206Sdyson m->valid = VM_PAGE_BITS_ALL; 20649945Salc vm_page_undirty(m); 20745347Sjulian } else if (size > toff) { 20845347Sjulian /* 20946349Salc * Read operation filled a partial page. 21045347Sjulian */ 21146349Salc m->valid = 0; 21245347Sjulian vm_page_set_validclean(m, 0, size - toff); 21346349Salc /* handled by vm_fault now */ 21446349Salc /* vm_page_zero_invalid(m, TRUE); */ 21587834Sdillon } else { 21687834Sdillon /* 21787834Sdillon * Read operation was short. If no error occured 21887834Sdillon * we may have hit a zero-fill section. We simply 21987834Sdillon * leave valid set to 0. 22087834Sdillon */ 22187834Sdillon ; 22234206Sdyson } 22325930Sdfr if (i != ap->a_reqpage) { 22434206Sdyson /* 22534206Sdyson * Whether or not to leave the page activated is up in 22634206Sdyson * the air, but we should put the page on a page queue 22734206Sdyson * somewhere (it already is in the object). Result: 22834206Sdyson * It appears that emperical results show that 22934206Sdyson * deactivating pages is best. 23034206Sdyson */ 23134206Sdyson 23234206Sdyson /* 23334206Sdyson * Just in case someone was asking for this page we 23434206Sdyson * now tell them that it is ok to use. 23534206Sdyson */ 23634206Sdyson if (!error) { 23734206Sdyson if (m->flags & PG_WANTED) 23834206Sdyson vm_page_activate(m); 23934206Sdyson else 24034206Sdyson vm_page_deactivate(m); 24138799Sdfr vm_page_wakeup(m); 24234206Sdyson } else { 24375692Salfred vm_page_free(m); 24434206Sdyson } 24525930Sdfr } 24625930Sdfr } 247100450Salc vm_page_unlock_queues(); 248116461Salc VM_OBJECT_UNLOCK(object); 24925930Sdfr return 0; 25025930Sdfr} 25125930Sdfr 25225930Sdfr/* 25334206Sdyson * Vnode op for VM putpages. 25434096Smsmith */ 25534096Smsmithint 25683651Speternfs_putpages(struct vop_putpages_args *ap) 25734096Smsmith{ 25834206Sdyson struct uio uio; 25934206Sdyson struct iovec iov; 26034206Sdyson vm_offset_t kva; 26134206Sdyson struct buf *bp; 26236563Speter int iomode, must_commit, i, error, npages, count; 26346349Salc off_t offset; 26434206Sdyson int *rtvals; 26536563Speter struct vnode *vp; 26683366Sjulian struct thread *td; 26736563Speter struct ucred *cred; 26836563Speter struct nfsmount *nmp; 26946349Salc struct nfsnode *np; 27036563Speter vm_page_t *pages; 27134206Sdyson 27279224Sdillon GIANT_REQUIRED; 27379224Sdillon 27436563Speter vp = ap->a_vp; 27546349Salc np = VTONFS(vp); 27683366Sjulian td = curthread; /* XXX */ 27791406Sjhb cred = curthread->td_ucred; /* XXX */ 27836563Speter nmp = VFSTONFS(vp->v_mount); 27936563Speter pages = ap->a_m; 28036563Speter count = ap->a_count; 28134206Sdyson rtvals = ap->a_rtvals; 28236563Speter npages = btoc(count); 28346349Salc offset = IDX_TO_OFF(pages[0]->pindex); 28434206Sdyson 28536563Speter if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 28676827Salfred (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { 28783366Sjulian (void)nfs_fsinfo(nmp, vp, cred, td); 28876827Salfred } 28934206Sdyson 290138899Sps if (!nfs_directio_allow_mmap && (np->n_flag & NNONCACHE) && 291138899Sps (vp->v_type == VREG)) 292138899Sps printf("nfs_putpages: called on noncache-able vnode??\n"); 293138899Sps 29483651Speter for (i = 0; i < npages; i++) 29534206Sdyson rtvals[i] = VM_PAGER_AGAIN; 29634206Sdyson 29734206Sdyson /* 29846349Salc * When putting pages, do not extend file past EOF. 29946349Salc */ 30046349Salc 30146349Salc if (offset + count > np->n_size) { 30246349Salc count = np->n_size - offset; 30346349Salc if (count < 0) 30446349Salc count = 0; 30546349Salc } 30646349Salc 30746349Salc /* 30834206Sdyson * We use only the kva address for the buffer, but this is extremely 30934206Sdyson * convienient and fast. 31034206Sdyson */ 31142957Sdillon bp = getpbuf(&nfs_pbuf_freecnt); 31234206Sdyson 31334206Sdyson kva = (vm_offset_t) bp->b_data; 31436563Speter pmap_qenter(kva, pages, npages); 31579247Sjhb cnt.v_vnodeout++; 31679247Sjhb cnt.v_vnodepgsout += count; 31734206Sdyson 31834206Sdyson iov.iov_base = (caddr_t) kva; 31936563Speter iov.iov_len = count; 32034206Sdyson uio.uio_iov = &iov; 32134206Sdyson uio.uio_iovcnt = 1; 32246349Salc uio.uio_offset = offset; 32336563Speter uio.uio_resid = count; 32434206Sdyson uio.uio_segflg = UIO_SYSSPACE; 32534206Sdyson uio.uio_rw = UIO_WRITE; 32683366Sjulian uio.uio_td = td; 32734206Sdyson 32834206Sdyson if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0) 32934206Sdyson iomode = NFSV3WRITE_UNSTABLE; 33034206Sdyson else 33134206Sdyson iomode = NFSV3WRITE_FILESYNC; 33234206Sdyson 333122953Salfred error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, &iomode, &must_commit); 33434206Sdyson 33534206Sdyson pmap_qremove(kva, npages); 33642957Sdillon relpbuf(bp, &nfs_pbuf_freecnt); 33734206Sdyson 33834206Sdyson if (!error) { 33936563Speter int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; 34034206Sdyson for (i = 0; i < nwritten; i++) { 34134206Sdyson rtvals[i] = VM_PAGER_OK; 34249945Salc vm_page_undirty(pages[i]); 34334206Sdyson } 34476827Salfred if (must_commit) { 34536563Speter nfs_clearcommit(vp->v_mount); 34676827Salfred } 34734206Sdyson } 34836563Speter return rtvals[0]; 34934096Smsmith} 35034096Smsmith 35134096Smsmith/* 3521541Srgrimes * Vnode op for read using bio 3531541Srgrimes */ 3541549Srgrimesint 35583651Speternfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred) 3561541Srgrimes{ 35783651Speter struct nfsnode *np = VTONFS(vp); 35883651Speter int biosize, i; 359143822Sdas struct buf *bp, *rabp; 3601541Srgrimes struct vattr vattr; 36183366Sjulian struct thread *td; 3629336Sdfr struct nfsmount *nmp = VFSTONFS(vp->v_mount); 3635455Sdg daddr_t lbn, rabn; 36446349Salc int bcount; 36551344Sdillon int seqcount; 36646349Salc int nra, error = 0, n = 0, on = 0; 3671541Srgrimes 3681541Srgrimes#ifdef DIAGNOSTIC 3691541Srgrimes if (uio->uio_rw != UIO_READ) 3701541Srgrimes panic("nfs_read mode"); 3711541Srgrimes#endif 3721541Srgrimes if (uio->uio_resid == 0) 3731541Srgrimes return (0); 37436473Speter if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */ 3751541Srgrimes return (EINVAL); 37683366Sjulian td = uio->uio_td; 37751344Sdillon 37836176Speter if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 37936176Speter (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) 38083366Sjulian (void)nfs_fsinfo(nmp, vp, cred, td); 38136473Speter if (vp->v_type != VDIR && 38236473Speter (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) 38336473Speter return (EFBIG); 384138899Sps 385138899Sps if (nfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG)) 386138899Sps /* No caching/ no readaheads. Just read data into the user buffer */ 387138899Sps return nfs_readrpc(vp, uio, cred); 388138899Sps 3899428Sdfr biosize = vp->v_mount->mnt_stat.f_iosize; 390108357Sdillon seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE); 3911541Srgrimes /* 3921541Srgrimes * For nfs, cache consistency can only be maintained approximately. 3931541Srgrimes * Although RFC1094 does not specify the criteria, the following is 3941541Srgrimes * believed to be compatible with the reference port. 3951541Srgrimes * For nfs: 3961541Srgrimes * If the file's modify time on the server has changed since the 3971541Srgrimes * last read rpc or you have written to the file, 3981541Srgrimes * you may have lost data cache consistency with the 3991541Srgrimes * server, so flush all of the file's data out of the cache. 4001541Srgrimes * Then force a getattr rpc to ensure that you have up to date 4011541Srgrimes * attributes. 4021541Srgrimes * NB: This implies that cache data can be read when up to 4031541Srgrimes * NFS_ATTRTIMEO seconds out of date. If you find that you need current 4041541Srgrimes * attributes this could be forced by setting n_attrstamp to 0 before 4051541Srgrimes * the VOP_GETATTR() call. 4061541Srgrimes */ 40783651Speter if (np->n_flag & NMODIFIED) { 40883651Speter if (vp->v_type != VREG) { 40983651Speter if (vp->v_type != VDIR) 41083651Speter panic("nfs: bioread, not dir"); 411122953Salfred (nmp->nm_rpcops->nr_invaldir)(vp); 412140731Sphk error = nfs_vinvalbuf(vp, V_SAVE, td, 1); 4133305Sphk if (error) 4141541Srgrimes return (error); 4151541Srgrimes } 41683651Speter np->n_attrstamp = 0; 41783651Speter error = VOP_GETATTR(vp, &vattr, cred, td); 41883651Speter if (error) 4191541Srgrimes return (error); 420138473Sps np->n_mtime = vattr.va_mtime; 42183651Speter } else { 42283651Speter error = VOP_GETATTR(vp, &vattr, cred, td); 42383651Speter if (error) 42483651Speter return (error); 425128263Speadar if ((np->n_flag & NSIZECHANGED) 426138473Sps || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) { 4279336Sdfr if (vp->v_type == VDIR) 428122953Salfred (nmp->nm_rpcops->nr_invaldir)(vp); 429140731Sphk error = nfs_vinvalbuf(vp, V_SAVE, td, 1); 4303305Sphk if (error) 43183651Speter return (error); 432138473Sps np->n_mtime = vattr.va_mtime; 433128263Speadar np->n_flag &= ~NSIZECHANGED; 4341541Srgrimes } 43583651Speter } 43683651Speter do { 4371541Srgrimes switch (vp->v_type) { 4381541Srgrimes case VREG: 4391541Srgrimes nfsstats.biocache_reads++; 4401541Srgrimes lbn = uio->uio_offset / biosize; 4419336Sdfr on = uio->uio_offset & (biosize - 1); 4421541Srgrimes 4431541Srgrimes /* 4441541Srgrimes * Start the read ahead(s), as required. 445138644Sps * The readahead is kicked off only if sequential access 446138644Sps * is detected, based on the readahead hint (ra_expect_lbn). 4471541Srgrimes */ 448138644Sps if (nmp->nm_readahead > 0 && np->ra_expect_lbn == lbn) { 44951344Sdillon for (nra = 0; nra < nmp->nm_readahead && nra < seqcount && 45013612Smpp (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) { 4515455Sdg rabn = lbn + 1 + nra; 452136767Sphk if (incore(&vp->v_bufobj, rabn) == NULL) { 45383366Sjulian rabp = nfs_getcacheblk(vp, rabn, biosize, td); 454131691Salfred if (!rabp) { 455131691Salfred error = nfs_sigintr(nmp, NULL, td); 456131691Salfred return (error ? error : EINTR); 457131691Salfred } 4588692Sdg if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 45958345Sphk rabp->b_flags |= B_ASYNC; 46058345Sphk rabp->b_iocmd = BIO_READ; 4615455Sdg vfs_busy_pages(rabp, 0); 462134898Sphk if (nfs_asyncio(nmp, rabp, cred, td)) { 46358934Sphk rabp->b_flags |= B_INVAL; 46458934Sphk rabp->b_ioflags |= BIO_ERROR; 4655455Sdg vfs_unbusy_pages(rabp); 4661541Srgrimes brelse(rabp); 46755431Sdillon break; 4681541Srgrimes } 46955431Sdillon } else { 4705471Sdg brelse(rabp); 47155431Sdillon } 4721541Srgrimes } 4731541Srgrimes } 474138644Sps np->ra_expect_lbn = lbn + 1; 4751541Srgrimes } 4761541Srgrimes 4771541Srgrimes /* 47846349Salc * Obtain the buffer cache block. Figure out the buffer size 47954605Sdillon * when we are at EOF. If we are modifying the size of the 48083651Speter * buffer based on an EOF condition we need to hold 48154605Sdillon * nfs_rslock() through obtaining the buffer to prevent 48254605Sdillon * a potential writer-appender from messing with n_size. 48354605Sdillon * Otherwise we may accidently truncate the buffer and 48454605Sdillon * lose dirty data. 48546349Salc * 48646349Salc * Note that bcount is *not* DEV_BSIZE aligned. 4871541Srgrimes */ 48846349Salc 48954605Sdillonagain: 49046349Salc bcount = biosize; 49146349Salc if ((off_t)lbn * biosize >= np->n_size) { 49246349Salc bcount = 0; 49346349Salc } else if ((off_t)(lbn + 1) * biosize > np->n_size) { 49446349Salc bcount = np->n_size - (off_t)lbn * biosize; 4958692Sdg } 49655431Sdillon if (bcount != biosize) { 49783366Sjulian switch(nfs_rslock(np, td)) { 49855431Sdillon case ENOLCK: 49955431Sdillon goto again; 50055431Sdillon /* not reached */ 501131691Salfred case EIO: 502131691Salfred return (EIO); 50355431Sdillon case EINTR: 50455431Sdillon case ERESTART: 50555431Sdillon return(EINTR); 50655431Sdillon /* not reached */ 50755431Sdillon default: 50855431Sdillon break; 50955431Sdillon } 51055431Sdillon } 51146349Salc 51283366Sjulian bp = nfs_getcacheblk(vp, lbn, bcount, td); 51354605Sdillon 51454605Sdillon if (bcount != biosize) 51583366Sjulian nfs_rsunlock(np, td); 516131691Salfred if (!bp) { 517131691Salfred error = nfs_sigintr(nmp, NULL, td); 518131691Salfred return (error ? error : EINTR); 519131691Salfred } 52042957Sdillon 52125930Sdfr /* 52246349Salc * If B_CACHE is not set, we must issue the read. If this 52346349Salc * fails, we return an error. 52425930Sdfr */ 52546349Salc 5267871Sdg if ((bp->b_flags & B_CACHE) == 0) { 52758345Sphk bp->b_iocmd = BIO_READ; 52832755Sdyson vfs_busy_pages(bp, 0); 529134898Sphk error = nfs_doio(vp, bp, cred, td); 53032755Sdyson if (error) { 53132755Sdyson brelse(bp); 53232755Sdyson return (error); 53332755Sdyson } 5341541Srgrimes } 53546349Salc 53646349Salc /* 53746349Salc * on is the offset into the current bp. Figure out how many 53846349Salc * bytes we can copy out of the bp. Note that bcount is 53946349Salc * NOT DEV_BSIZE aligned. 54046349Salc * 54146349Salc * Then figure out how many bytes we can copy into the uio. 54246349Salc */ 54346349Salc 54446349Salc n = 0; 54546349Salc if (on < bcount) 54646349Salc n = min((unsigned)(bcount - on), uio->uio_resid); 5471541Srgrimes break; 5481541Srgrimes case VLNK: 5491541Srgrimes nfsstats.biocache_readlinks++; 55083366Sjulian bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td); 551131691Salfred if (!bp) { 552131691Salfred error = nfs_sigintr(nmp, NULL, td); 553131691Salfred return (error ? error : EINTR); 554131691Salfred } 5557871Sdg if ((bp->b_flags & B_CACHE) == 0) { 55658345Sphk bp->b_iocmd = BIO_READ; 55732755Sdyson vfs_busy_pages(bp, 0); 558134898Sphk error = nfs_doio(vp, bp, cred, td); 55932755Sdyson if (error) { 56058934Sphk bp->b_ioflags |= BIO_ERROR; 56132755Sdyson brelse(bp); 56232755Sdyson return (error); 56332755Sdyson } 5641541Srgrimes } 5651541Srgrimes n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); 5661541Srgrimes on = 0; 5671541Srgrimes break; 5681541Srgrimes case VDIR: 5691541Srgrimes nfsstats.biocache_readdirs++; 57024577Sdfr if (np->n_direofoffset 57124577Sdfr && uio->uio_offset >= np->n_direofoffset) { 57224577Sdfr return (0); 57324577Sdfr } 57436979Sbde lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ; 5759336Sdfr on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); 57683366Sjulian bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td); 577131691Salfred if (!bp) { 578131691Salfred error = nfs_sigintr(nmp, NULL, td); 579131691Salfred return (error ? error : EINTR); 580131691Salfred } 5817871Sdg if ((bp->b_flags & B_CACHE) == 0) { 58258345Sphk bp->b_iocmd = BIO_READ; 5839336Sdfr vfs_busy_pages(bp, 0); 584134898Sphk error = nfs_doio(vp, bp, cred, td); 58532912Stegge if (error) { 58632912Stegge brelse(bp); 58732912Stegge } 58832755Sdyson while (error == NFSERR_BAD_COOKIE) { 589122953Salfred (nmp->nm_rpcops->nr_invaldir)(vp); 590140731Sphk error = nfs_vinvalbuf(vp, 0, td, 1); 59132755Sdyson /* 59232755Sdyson * Yuck! The directory has been modified on the 59332755Sdyson * server. The only way to get the block is by 59432755Sdyson * reading from the beginning to get all the 59532755Sdyson * offset cookies. 59646349Salc * 59746349Salc * Leave the last bp intact unless there is an error. 59846349Salc * Loop back up to the while if the error is another 59946349Salc * NFSERR_BAD_COOKIE (double yuch!). 60032755Sdyson */ 60132755Sdyson for (i = 0; i <= lbn && !error; i++) { 60232755Sdyson if (np->n_direofoffset 60332755Sdyson && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) 60424577Sdfr return (0); 60583366Sjulian bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td); 606131691Salfred if (!bp) { 607131691Salfred error = nfs_sigintr(nmp, NULL, td); 608131691Salfred return (error ? error : EINTR); 609131691Salfred } 61046349Salc if ((bp->b_flags & B_CACHE) == 0) { 61158345Sphk bp->b_iocmd = BIO_READ; 61246349Salc vfs_busy_pages(bp, 0); 613134898Sphk error = nfs_doio(vp, bp, cred, td); 61446349Salc /* 61546349Salc * no error + B_INVAL == directory EOF, 61646349Salc * use the block. 61746349Salc */ 61846349Salc if (error == 0 && (bp->b_flags & B_INVAL)) 61946349Salc break; 62046349Salc } 62146349Salc /* 62246349Salc * An error will throw away the block and the 62346349Salc * for loop will break out. If no error and this 62446349Salc * is not the block we want, we throw away the 62546349Salc * block and go for the next one via the for loop. 62646349Salc */ 62746349Salc if (error || i < lbn) 62832755Sdyson brelse(bp); 6291541Srgrimes } 63032912Stegge } 63146349Salc /* 63246349Salc * The above while is repeated if we hit another cookie 63346349Salc * error. If we hit an error and it wasn't a cookie error, 63446349Salc * we give up. 63546349Salc */ 63632912Stegge if (error) 6379336Sdfr return (error); 6381541Srgrimes } 6391541Srgrimes 6401541Srgrimes /* 6411541Srgrimes * If not eof and read aheads are enabled, start one. 6421541Srgrimes * (You need the current block first, so that you have the 6439336Sdfr * directory offset cookie of the next block.) 6441541Srgrimes */ 64589324Speter if (nmp->nm_readahead > 0 && 64639782Smckusick (bp->b_flags & B_INVAL) == 0 && 6479336Sdfr (np->n_direofoffset == 0 || 6489336Sdfr (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && 649136767Sphk incore(&vp->v_bufobj, lbn + 1) == NULL) { 65083366Sjulian rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td); 6511541Srgrimes if (rabp) { 6528692Sdg if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 65358345Sphk rabp->b_flags |= B_ASYNC; 65458345Sphk rabp->b_iocmd = BIO_READ; 6555455Sdg vfs_busy_pages(rabp, 0); 656134898Sphk if (nfs_asyncio(nmp, rabp, cred, td)) { 65758934Sphk rabp->b_flags |= B_INVAL; 65858934Sphk rabp->b_ioflags |= BIO_ERROR; 6595455Sdg vfs_unbusy_pages(rabp); 6601541Srgrimes brelse(rabp); 6611541Srgrimes } 6625471Sdg } else { 6635471Sdg brelse(rabp); 6641541Srgrimes } 6651541Srgrimes } 6661541Srgrimes } 66726469Sdfr /* 66846349Salc * Unlike VREG files, whos buffer size ( bp->b_bcount ) is 66946349Salc * chopped for the EOF condition, we cannot tell how large 67046349Salc * NFS directories are going to be until we hit EOF. So 67146349Salc * an NFS directory buffer is *not* chopped to its EOF. Now, 67246349Salc * it just so happens that b_resid will effectively chop it 67346349Salc * to EOF. *BUT* this information is lost if the buffer goes 67446349Salc * away and is reconstituted into a B_CACHE state ( due to 67546349Salc * being VMIO ) later. So we keep track of the directory eof 67683651Speter * in np->n_direofoffset and chop it off as an extra step 67746349Salc * right here. 67826469Sdfr */ 67926469Sdfr n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); 68046349Salc if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset) 68146349Salc n = np->n_direofoffset - uio->uio_offset; 6821541Srgrimes break; 6833305Sphk default: 68483651Speter printf(" nfs_bioread: type %x unexpected\n", vp->v_type); 685143822Sdas bp = NULL; 6863305Sphk break; 6871541Srgrimes }; 6881541Srgrimes 6891541Srgrimes if (n > 0) { 69034206Sdyson error = uiomove(bp->b_data + on, (int)n, uio); 6911541Srgrimes } 692143822Sdas if (vp->v_type == VLNK) 6931541Srgrimes n = 0; 694143822Sdas if (bp != NULL) 695143822Sdas brelse(bp); 6961541Srgrimes } while (error == 0 && uio->uio_resid > 0 && n > 0); 6971541Srgrimes return (error); 6981541Srgrimes} 6991541Srgrimes 7001541Srgrimes/* 701138899Sps * The NFS write path cannot handle iovecs with len > 1. So we need to 702138899Sps * break up iovecs accordingly (restricting them to wsize). 703138899Sps * For the SYNC case, we can do this with 1 copy (user buffer -> mbuf). 704138899Sps * For the ASYNC case, 2 copies are needed. The first a copy from the 705138899Sps * user buffer to a staging buffer and then a second copy from the staging 706138899Sps * buffer to mbufs. This can be optimized by copying from the user buffer 707138899Sps * directly into mbufs and passing the chain down, but that requires a 708138899Sps * fair amount of re-working of the relevant codepaths (and can be done 709138899Sps * later). 710138899Sps */ 711138899Spsstatic int 712138899Spsnfs_directio_write(vp, uiop, cred, ioflag) 713138899Sps struct vnode *vp; 714138899Sps struct uio *uiop; 715138899Sps struct ucred *cred; 716138899Sps int ioflag; 717138899Sps{ 718138899Sps int error; 719138899Sps struct nfsmount *nmp = VFSTONFS(vp->v_mount); 720138899Sps struct thread *td = uiop->uio_td; 721138899Sps int size; 722138899Sps 723138899Sps if (ioflag & IO_SYNC) { 724138899Sps int iomode, must_commit; 725138899Sps struct uio uio; 726138899Sps struct iovec iov; 727138899Spsdo_sync: 728138899Sps while (uiop->uio_resid > 0) { 729138899Sps size = min(uiop->uio_resid, nmp->nm_wsize); 730138899Sps size = min(uiop->uio_iov->iov_len, size); 731138899Sps iov.iov_base = uiop->uio_iov->iov_base; 732138899Sps iov.iov_len = size; 733138899Sps uio.uio_iov = &iov; 734138899Sps uio.uio_iovcnt = 1; 735138899Sps uio.uio_offset = uiop->uio_offset; 736138899Sps uio.uio_resid = size; 737138899Sps uio.uio_segflg = UIO_USERSPACE; 738138899Sps uio.uio_rw = UIO_WRITE; 739138899Sps uio.uio_td = td; 740138899Sps iomode = NFSV3WRITE_FILESYNC; 741138899Sps error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, 742138899Sps &iomode, &must_commit); 743138899Sps KASSERT((must_commit == 0), 744138899Sps ("nfs_directio_write: Did not commit write")); 745138899Sps if (error) 746138899Sps return (error); 747138899Sps uiop->uio_offset += size; 748138899Sps uiop->uio_resid -= size; 749138899Sps if (uiop->uio_iov->iov_len <= size) { 750138899Sps uiop->uio_iovcnt--; 751138899Sps uiop->uio_iov++; 752138899Sps } else { 753138899Sps uiop->uio_iov->iov_base = 754138899Sps (char *)uiop->uio_iov->iov_base + size; 755138899Sps uiop->uio_iov->iov_len -= size; 756138899Sps } 757138899Sps } 758138899Sps } else { 759138899Sps struct uio *t_uio; 760138899Sps struct iovec *t_iov; 761138899Sps struct buf *bp; 762138899Sps 763138899Sps /* 764138899Sps * Break up the write into blocksize chunks and hand these 765138899Sps * over to nfsiod's for write back. 766138899Sps * Unfortunately, this incurs a copy of the data. Since 767138899Sps * the user could modify the buffer before the write is 768138899Sps * initiated. 769138899Sps * 770138899Sps * The obvious optimization here is that one of the 2 copies 771138899Sps * in the async write path can be eliminated by copying the 772138899Sps * data here directly into mbufs and passing the mbuf chain 773138899Sps * down. But that will require a fair amount of re-working 774138899Sps * of the code and can be done if there's enough interest 775138899Sps * in NFS directio access. 776138899Sps */ 777138899Sps while (uiop->uio_resid > 0) { 778138899Sps size = min(uiop->uio_resid, nmp->nm_wsize); 779138899Sps size = min(uiop->uio_iov->iov_len, size); 780138899Sps bp = getpbuf(&nfs_pbuf_freecnt); 781138899Sps t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK); 782138899Sps t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK); 783138899Sps t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK); 784138899Sps t_iov->iov_len = size; 785138899Sps t_uio->uio_iov = t_iov; 786138899Sps t_uio->uio_iovcnt = 1; 787138899Sps t_uio->uio_offset = uiop->uio_offset; 788138899Sps t_uio->uio_resid = size; 789138899Sps t_uio->uio_segflg = UIO_SYSSPACE; 790138899Sps t_uio->uio_rw = UIO_WRITE; 791138899Sps t_uio->uio_td = td; 792138899Sps bcopy(uiop->uio_iov->iov_base, t_iov->iov_base, size); 793138899Sps bp->b_flags |= B_DIRECT; 794138899Sps bp->b_iocmd = BIO_WRITE; 795138899Sps if (cred != NOCRED) { 796138899Sps crhold(cred); 797138899Sps bp->b_wcred = cred; 798138899Sps } else 799138899Sps bp->b_wcred = NOCRED; 800138899Sps bp->b_caller1 = (void *)t_uio; 801138899Sps bp->b_vp = vp; 802138899Sps vhold(vp); 803138899Sps error = nfs_asyncio(nmp, bp, NOCRED, td); 804138899Sps if (error) { 805138899Sps free(t_iov->iov_base, M_NFSDIRECTIO); 806138899Sps free(t_iov, M_NFSDIRECTIO); 807138899Sps free(t_uio, M_NFSDIRECTIO); 808138899Sps vdrop(bp->b_vp); 809138899Sps bp->b_vp = NULL; 810138899Sps relpbuf(bp, &nfs_pbuf_freecnt); 811138899Sps if (error == EINTR) 812138899Sps return (error); 813138899Sps goto do_sync; 814138899Sps } 815138899Sps uiop->uio_offset += size; 816138899Sps uiop->uio_resid -= size; 817138899Sps if (uiop->uio_iov->iov_len <= size) { 818138899Sps uiop->uio_iovcnt--; 819138899Sps uiop->uio_iov++; 820138899Sps } else { 821138899Sps uiop->uio_iov->iov_base = 822138899Sps (char *)uiop->uio_iov->iov_base + size; 823138899Sps uiop->uio_iov->iov_len -= size; 824138899Sps } 825138899Sps } 826138899Sps } 827138899Sps return (0); 828138899Sps} 829138899Sps 830138899Sps/* 8311541Srgrimes * Vnode op for write using bio 8321541Srgrimes */ 8331549Srgrimesint 83483651Speternfs_write(struct vop_write_args *ap) 8351541Srgrimes{ 83646349Salc int biosize; 83746349Salc struct uio *uio = ap->a_uio; 83883366Sjulian struct thread *td = uio->uio_td; 83946349Salc struct vnode *vp = ap->a_vp; 8401541Srgrimes struct nfsnode *np = VTONFS(vp); 84146349Salc struct ucred *cred = ap->a_cred; 8421541Srgrimes int ioflag = ap->a_ioflag; 8431541Srgrimes struct buf *bp; 8441541Srgrimes struct vattr vattr; 8459336Sdfr struct nfsmount *nmp = VFSTONFS(vp->v_mount); 84611921Sphk daddr_t lbn; 84746349Salc int bcount; 84883651Speter int n, on, error = 0; 84954605Sdillon int haverslock = 0; 85083366Sjulian struct proc *p = td?td->td_proc:NULL; 8511541Srgrimes 85279224Sdillon GIANT_REQUIRED; 85379224Sdillon 8541541Srgrimes#ifdef DIAGNOSTIC 8551541Srgrimes if (uio->uio_rw != UIO_WRITE) 8561541Srgrimes panic("nfs_write mode"); 85783366Sjulian if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread) 8581541Srgrimes panic("nfs_write proc"); 8591541Srgrimes#endif 8601541Srgrimes if (vp->v_type != VREG) 8611541Srgrimes return (EIO); 8621541Srgrimes if (np->n_flag & NWRITEERR) { 8631541Srgrimes np->n_flag &= ~NWRITEERR; 8641541Srgrimes return (np->n_error); 8651541Srgrimes } 86636176Speter if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 86736176Speter (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) 86883366Sjulian (void)nfs_fsinfo(nmp, vp, cred, td); 86954605Sdillon 87054605Sdillon /* 87154605Sdillon * Synchronously flush pending buffers if we are in synchronous 87254605Sdillon * mode or if we are appending. 87354605Sdillon */ 8741541Srgrimes if (ioflag & (IO_APPEND | IO_SYNC)) { 8751541Srgrimes if (np->n_flag & NMODIFIED) { 876147420Sgreen#ifdef notyet /* Needs matching nonblock semantics elsewhere, too. */ 877147280Sgreen /* 878147280Sgreen * Require non-blocking, synchronous writes to 879147280Sgreen * dirty files to inform the program it needs 880147280Sgreen * to fsync(2) explicitly. 881147280Sgreen */ 882147280Sgreen if (ioflag & IO_NDELAY) 883147280Sgreen return (EAGAIN); 884147420Sgreen#endif 885147280Sgreenflush_and_restart: 8861541Srgrimes np->n_attrstamp = 0; 887140731Sphk error = nfs_vinvalbuf(vp, V_SAVE, td, 1); 8883305Sphk if (error) 8891541Srgrimes return (error); 8901541Srgrimes } 8911541Srgrimes } 89254605Sdillon 89354605Sdillon /* 89454605Sdillon * If IO_APPEND then load uio_offset. We restart here if we cannot 89554605Sdillon * get the append lock. 89654605Sdillon */ 89754605Sdillonrestart: 89854605Sdillon if (ioflag & IO_APPEND) { 89954605Sdillon np->n_attrstamp = 0; 90083366Sjulian error = VOP_GETATTR(vp, &vattr, cred, td); 90154605Sdillon if (error) 90254605Sdillon return (error); 90354605Sdillon uio->uio_offset = np->n_size; 90454605Sdillon } 90554605Sdillon 9061541Srgrimes if (uio->uio_offset < 0) 9071541Srgrimes return (EINVAL); 90836473Speter if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) 90936473Speter return (EFBIG); 9101541Srgrimes if (uio->uio_resid == 0) 9111541Srgrimes return (0); 91254605Sdillon 913138899Sps if (nfs_directio_enable && (ioflag & IO_DIRECT) && vp->v_type == VREG) 914138899Sps return nfs_directio_write(vp, uio, cred, ioflag); 915138899Sps 9161541Srgrimes /* 91754605Sdillon * We need to obtain the rslock if we intend to modify np->n_size 91854605Sdillon * in order to guarentee the append point with multiple contending 91954605Sdillon * writers, to guarentee that no other appenders modify n_size 92054605Sdillon * while we are trying to obtain a truncated buffer (i.e. to avoid 92154605Sdillon * accidently truncating data written by another appender due to 92254605Sdillon * the race), and to ensure that the buffer is populated prior to 92354605Sdillon * our extending of the file. We hold rslock through the entire 92454605Sdillon * operation. 92554605Sdillon * 92654605Sdillon * Note that we do not synchronize the case where someone truncates 92754605Sdillon * the file while we are appending to it because attempting to lock 92854605Sdillon * this case may deadlock other parts of the system unexpectedly. 92954605Sdillon */ 93054605Sdillon if ((ioflag & IO_APPEND) || 93154605Sdillon uio->uio_offset + uio->uio_resid > np->n_size) { 93283366Sjulian switch(nfs_rslock(np, td)) { 93355431Sdillon case ENOLCK: 93454605Sdillon goto restart; 93555431Sdillon /* not reached */ 936131691Salfred case EIO: 937131691Salfred return (EIO); 93855431Sdillon case EINTR: 93955431Sdillon case ERESTART: 94055431Sdillon return(EINTR); 94155431Sdillon /* not reached */ 94255431Sdillon default: 94355431Sdillon break; 94455431Sdillon } 94554605Sdillon haverslock = 1; 94654605Sdillon } 94754605Sdillon 94854605Sdillon /* 9491541Srgrimes * Maybe this should be above the vnode op call, but so long as 9501541Srgrimes * file servers have no limits, i don't think it matters 9511541Srgrimes */ 952125454Sjhb if (p != NULL) { 95373929Sjhb PROC_LOCK(p); 954125454Sjhb if (uio->uio_offset + uio->uio_resid > 955125454Sjhb lim_cur(p, RLIMIT_FSIZE)) { 956125454Sjhb psignal(p, SIGXFSZ); 957125454Sjhb PROC_UNLOCK(p); 958125454Sjhb if (haverslock) 959125454Sjhb nfs_rsunlock(np, td); 960125454Sjhb return (EFBIG); 961125454Sjhb } 96273929Sjhb PROC_UNLOCK(p); 9631541Srgrimes } 96446349Salc 9659428Sdfr biosize = vp->v_mount->mnt_stat.f_iosize; 966147280Sgreen /* 967147280Sgreen * Find all of this file's B_NEEDCOMMIT buffers. If our writes 968147280Sgreen * would exceed the local maximum per-file write commit size when 969147280Sgreen * combined with those, we must decide whether to flush, 970147280Sgreen * go synchronous, or return error. We don't bother checking 971147280Sgreen * IO_UNIT -- we just make all writes atomic anyway, as there's 972147280Sgreen * no point optimizing for something that really won't ever happen. 973147280Sgreen */ 974147280Sgreen if (!(ioflag & IO_SYNC)) { 975147280Sgreen int needrestart = 0; 976147280Sgreen if (nmp->nm_wcommitsize < uio->uio_resid) { 977147280Sgreen /* 978147280Sgreen * If this request could not possibly be completed 979147280Sgreen * without exceeding the maximum outstanding write 980147280Sgreen * commit size, see if we can convert it into a 981147280Sgreen * synchronous write operation. 982147280Sgreen */ 983147280Sgreen if (ioflag & IO_NDELAY) 984147280Sgreen return (EAGAIN); 985147280Sgreen ioflag |= IO_SYNC; 986147280Sgreen if (np->n_flag & NMODIFIED) 987147280Sgreen needrestart = 1; 988147280Sgreen } else if (np->n_flag & NMODIFIED) { 989147280Sgreen int wouldcommit = 0; 990147280Sgreen BO_LOCK(&vp->v_bufobj); 991147280Sgreen if (vp->v_bufobj.bo_dirty.bv_cnt != 0) { 992147280Sgreen TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, 993147280Sgreen b_bobufs) { 994147280Sgreen if (bp->b_flags & B_NEEDCOMMIT) 995147280Sgreen wouldcommit += bp->b_bcount; 996147280Sgreen } 997147280Sgreen } 998147280Sgreen BO_UNLOCK(&vp->v_bufobj); 999147280Sgreen /* 1000147280Sgreen * Since we're not operating synchronously and 1001147280Sgreen * bypassing the buffer cache, we are in a commit 1002147280Sgreen * and holding all of these buffers whether 1003147280Sgreen * transmitted or not. If not limited, this 1004147280Sgreen * will lead to the buffer cache deadlocking, 1005147280Sgreen * as no one else can flush our uncommitted buffers. 1006147280Sgreen */ 1007147280Sgreen wouldcommit += uio->uio_resid; 1008147280Sgreen /* 1009147280Sgreen * If we would initially exceed the maximum 1010147280Sgreen * outstanding write commit size, flush and restart. 1011147280Sgreen */ 1012147280Sgreen if (wouldcommit > nmp->nm_wcommitsize) 1013147280Sgreen needrestart = 1; 1014147280Sgreen } 1015147280Sgreen if (needrestart) { 1016147280Sgreen if (haverslock) { 1017147280Sgreen nfs_rsunlock(np, td); 1018147280Sgreen haverslock = 0; 1019147280Sgreen } 1020147280Sgreen goto flush_and_restart; 1021147280Sgreen } 1022147280Sgreen } 102346349Salc 10241541Srgrimes do { 10251541Srgrimes nfsstats.biocache_writes++; 10261541Srgrimes lbn = uio->uio_offset / biosize; 10271541Srgrimes on = uio->uio_offset & (biosize-1); 10281541Srgrimes n = min((unsigned)(biosize - on), uio->uio_resid); 10291541Srgrimesagain: 103046349Salc /* 103146349Salc * Handle direct append and file extension cases, calculate 103246349Salc * unaligned buffer size. 103346349Salc */ 103446349Salc 103546349Salc if (uio->uio_offset == np->n_size && n) { 103646349Salc /* 103754605Sdillon * Get the buffer (in its pre-append state to maintain 103854605Sdillon * B_CACHE if it was previously set). Resize the 103954605Sdillon * nfsnode after we have locked the buffer to prevent 104054605Sdillon * readers from reading garbage. 104146349Salc */ 104246349Salc bcount = on; 104383366Sjulian bp = nfs_getcacheblk(vp, lbn, bcount, td); 104446349Salc 104554605Sdillon if (bp != NULL) { 104654605Sdillon long save; 104746349Salc 104854605Sdillon np->n_size = uio->uio_offset + n; 104954605Sdillon np->n_flag |= NMODIFIED; 105054605Sdillon vnode_pager_setsize(vp, np->n_size); 105154605Sdillon 105254605Sdillon save = bp->b_flags & B_CACHE; 105354605Sdillon bcount += n; 105454605Sdillon allocbuf(bp, bcount); 105554605Sdillon bp->b_flags |= save; 105654605Sdillon } 105746349Salc } else { 105854605Sdillon /* 105983651Speter * Obtain the locked cache block first, and then 106054605Sdillon * adjust the file's size as appropriate. 106154605Sdillon */ 106254605Sdillon bcount = on + n; 106354605Sdillon if ((off_t)lbn * biosize + bcount < np->n_size) { 106454605Sdillon if ((off_t)(lbn + 1) * biosize < np->n_size) 106554605Sdillon bcount = biosize; 106654605Sdillon else 106754605Sdillon bcount = np->n_size - (off_t)lbn * biosize; 106854605Sdillon } 106983366Sjulian bp = nfs_getcacheblk(vp, lbn, bcount, td); 107046349Salc if (uio->uio_offset + n > np->n_size) { 107146349Salc np->n_size = uio->uio_offset + n; 107246349Salc np->n_flag |= NMODIFIED; 107346349Salc vnode_pager_setsize(vp, np->n_size); 107446349Salc } 10758692Sdg } 107646349Salc 107754605Sdillon if (!bp) { 1078131691Salfred error = nfs_sigintr(nmp, NULL, td); 1079131691Salfred if (!error) 1080131691Salfred error = EINTR; 108154605Sdillon break; 108254605Sdillon } 108354605Sdillon 108446349Salc /* 108546349Salc * Issue a READ if B_CACHE is not set. In special-append 108646349Salc * mode, B_CACHE is based on the buffer prior to the write 108746349Salc * op and is typically set, avoiding the read. If a read 108846349Salc * is required in special append mode, the server will 108946349Salc * probably send us a short-read since we extended the file 109083651Speter * on our end, resulting in b_resid == 0 and, thusly, 109146349Salc * B_CACHE getting set. 109246349Salc * 109346349Salc * We can also avoid issuing the read if the write covers 109446349Salc * the entire buffer. We have to make sure the buffer state 109546349Salc * is reasonable in this case since we will not be initiating 109646349Salc * I/O. See the comments in kern/vfs_bio.c's getblk() for 109746349Salc * more information. 109846349Salc * 109946349Salc * B_CACHE may also be set due to the buffer being cached 110046349Salc * normally. 110146349Salc */ 110246349Salc 110346349Salc if (on == 0 && n == bcount) { 110446349Salc bp->b_flags |= B_CACHE; 110558934Sphk bp->b_flags &= ~B_INVAL; 110658934Sphk bp->b_ioflags &= ~BIO_ERROR; 11078692Sdg } 110846349Salc 110946349Salc if ((bp->b_flags & B_CACHE) == 0) { 111058345Sphk bp->b_iocmd = BIO_READ; 111146349Salc vfs_busy_pages(bp, 0); 1112134898Sphk error = nfs_doio(vp, bp, cred, td); 111346349Salc if (error) { 111446349Salc brelse(bp); 111554605Sdillon break; 111646349Salc } 111746349Salc } 111884827Sjhb if (bp->b_wcred == NOCRED) 111984827Sjhb bp->b_wcred = crhold(cred); 11201541Srgrimes np->n_flag |= NMODIFIED; 11218692Sdg 112245347Sjulian /* 112354605Sdillon * If dirtyend exceeds file size, chop it down. This should 112454605Sdillon * not normally occur but there is an append race where it 112583651Speter * might occur XXX, so we log it. 112654605Sdillon * 112754605Sdillon * If the chopping creates a reverse-indexed or degenerate 112854605Sdillon * situation with dirtyoff/end, we 0 both of them. 112945347Sjulian */ 113045347Sjulian 113154605Sdillon if (bp->b_dirtyend > bcount) { 113283651Speter printf("NFS append race @%lx:%d\n", 113383651Speter (long)bp->b_blkno * DEV_BSIZE, 113454605Sdillon bp->b_dirtyend - bcount); 113554605Sdillon bp->b_dirtyend = bcount; 113654605Sdillon } 113754605Sdillon 113845347Sjulian if (bp->b_dirtyoff >= bp->b_dirtyend) 113945347Sjulian bp->b_dirtyoff = bp->b_dirtyend = 0; 114031617Sdyson 11411541Srgrimes /* 114231617Sdyson * If the new write will leave a contiguous dirty 114331617Sdyson * area, just update the b_dirtyoff and b_dirtyend, 114431617Sdyson * otherwise force a write rpc of the old dirty area. 114546349Salc * 114683651Speter * While it is possible to merge discontiguous writes due to 114746349Salc * our having a B_CACHE buffer ( and thus valid read data 114883651Speter * for the hole), we don't because it could lead to 114946349Salc * significant cache coherency problems with multiple clients, 115046349Salc * especially if locking is implemented later on. 115146349Salc * 115246349Salc * as an optimization we could theoretically maintain 115346349Salc * a linked list of discontinuous areas, but we would still 115446349Salc * have to commit them separately so there isn't much 115546349Salc * advantage to it except perhaps a bit of asynchronization. 115631617Sdyson */ 115742957Sdillon 115831617Sdyson if (bp->b_dirtyend > 0 && 115931617Sdyson (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { 1160126853Sphk if (bwrite(bp) == EINTR) { 1161100194Sdillon error = EINTR; 1162100194Sdillon break; 1163100194Sdillon } 116431617Sdyson goto again; 116531617Sdyson } 116631617Sdyson 11673305Sphk error = uiomove((char *)bp->b_data + on, n, uio); 116854480Sdillon 116954480Sdillon /* 117054480Sdillon * Since this block is being modified, it must be written 117154480Sdillon * again and not just committed. Since write clustering does 117254480Sdillon * not work for the stage 1 data write, only the stage 2 117354480Sdillon * commit rpc, we have to clear B_CLUSTEROK as well. 117454480Sdillon */ 117554480Sdillon bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 117654480Sdillon 11773305Sphk if (error) { 117858934Sphk bp->b_ioflags |= BIO_ERROR; 11791541Srgrimes brelse(bp); 118054605Sdillon break; 11811541Srgrimes } 118234206Sdyson 118334206Sdyson /* 118483651Speter * Only update dirtyoff/dirtyend if not a degenerate 118545347Sjulian * condition. 118645347Sjulian */ 118745347Sjulian if (n) { 118845347Sjulian if (bp->b_dirtyend > 0) { 118945347Sjulian bp->b_dirtyoff = min(on, bp->b_dirtyoff); 119045347Sjulian bp->b_dirtyend = max((on + n), bp->b_dirtyend); 119145347Sjulian } else { 119245347Sjulian bp->b_dirtyoff = on; 119345347Sjulian bp->b_dirtyend = on + n; 119445347Sjulian } 119546349Salc vfs_bio_set_validclean(bp, on, n); 11961541Srgrimes } 119745347Sjulian 119844679Sjulian /* 119983651Speter * If IO_SYNC do bwrite(). 120046349Salc * 120146349Salc * IO_INVAL appears to be unused. The idea appears to be 120246349Salc * to turn off caching in this case. Very odd. XXX 12031541Srgrimes */ 120483651Speter if ((ioflag & IO_SYNC)) { 120534206Sdyson if (ioflag & IO_INVAL) 120646349Salc bp->b_flags |= B_NOCACHE; 1207126853Sphk error = bwrite(bp); 12083305Sphk if (error) 120954605Sdillon break; 121083651Speter } else if ((n + on) == biosize) { 12119336Sdfr bp->b_flags |= B_ASYNC; 1212122953Salfred (void) (nmp->nm_rpcops->nr_writebp)(bp, 0, 0); 121346349Salc } else { 12141541Srgrimes bdwrite(bp); 121546349Salc } 12161541Srgrimes } while (uio->uio_resid > 0 && n > 0); 121754605Sdillon 121854605Sdillon if (haverslock) 121983366Sjulian nfs_rsunlock(np, td); 122054605Sdillon 122154605Sdillon return (error); 12221541Srgrimes} 12231541Srgrimes 12241541Srgrimes/* 12251541Srgrimes * Get an nfs cache block. 122654480Sdillon * 12271541Srgrimes * Allocate a new one if the block isn't currently in the cache 12281541Srgrimes * and return the block marked busy. If the calling process is 12291541Srgrimes * interrupted by a signal for an interruptible mount point, return 12301541Srgrimes * NULL. 123154480Sdillon * 123254480Sdillon * The caller must carefully deal with the possible B_INVAL state of 123354480Sdillon * the buffer. nfs_doio() clears B_INVAL (and nfs_asyncio() clears it 123454480Sdillon * indirectly), so synchronous reads can be issued without worrying about 123554480Sdillon * the B_INVAL state. We have to be a little more careful when dealing 123654480Sdillon * with writes (see comments in nfs_write()) when extending a file past 123754480Sdillon * its EOF. 12381541Srgrimes */ 123912911Sphkstatic struct buf * 124083651Speternfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td) 12411541Srgrimes{ 124283651Speter struct buf *bp; 124332755Sdyson struct mount *mp; 124432755Sdyson struct nfsmount *nmp; 12451541Srgrimes 124632755Sdyson mp = vp->v_mount; 124732755Sdyson nmp = VFSTONFS(mp); 124832755Sdyson 12491541Srgrimes if (nmp->nm_flag & NFSMNT_INT) { 1250138496Sps sigset_t oldset; 1251138496Sps 1252138496Sps nfs_set_sigmask(td, &oldset); 1253111856Sjeff bp = getblk(vp, bn, size, PCATCH, 0, 0); 1254138496Sps nfs_restore_sigmask(td, &oldset); 125599797Sdillon while (bp == NULL) { 125699797Sdillon if (nfs_sigintr(nmp, NULL, td)) 125799797Sdillon return (NULL); 1258111856Sjeff bp = getblk(vp, bn, size, 0, 2 * hz, 0); 12591541Srgrimes } 126046349Salc } else { 1261111856Sjeff bp = getblk(vp, bn, size, 0, 0, 0); 126246349Salc } 12635455Sdg 126441791Sdt if (vp->v_type == VREG) { 126532755Sdyson int biosize; 126646349Salc 126732755Sdyson biosize = mp->mnt_stat.f_iosize; 126841791Sdt bp->b_blkno = bn * (biosize / DEV_BSIZE); 126932755Sdyson } 12701541Srgrimes return (bp); 12711541Srgrimes} 12721541Srgrimes 12731541Srgrimes/* 12741541Srgrimes * Flush and invalidate all dirty buffers. If another process is already 12751541Srgrimes * doing the flush, just wait for completion. 12761541Srgrimes */ 12771549Srgrimesint 1278140731Sphknfs_vinvalbuf(struct vnode *vp, int flags, struct thread *td, int intrflg) 12791541Srgrimes{ 128083651Speter struct nfsnode *np = VTONFS(vp); 12811541Srgrimes struct nfsmount *nmp = VFSTONFS(vp->v_mount); 12821541Srgrimes int error = 0, slpflag, slptimeo; 1283138469Sps int old_lock = 0; 12841541Srgrimes 1285115041Srwatson ASSERT_VOP_LOCKED(vp, "nfs_vinvalbuf"); 1286115041Srwatson 1287120730Sjeff /* 1288120730Sjeff * XXX This check stops us from needlessly doing a vinvalbuf when 1289120730Sjeff * being called through vclean(). It is not clear that this is 1290120730Sjeff * unsafe. 1291120730Sjeff */ 1292143510Sjeff if (vp->v_iflag & VI_DOOMED) 129332755Sdyson return (0); 129432755Sdyson 12951541Srgrimes if ((nmp->nm_flag & NFSMNT_INT) == 0) 12961541Srgrimes intrflg = 0; 12971541Srgrimes if (intrflg) { 12981541Srgrimes slpflag = PCATCH; 12991541Srgrimes slptimeo = 2 * hz; 13001541Srgrimes } else { 13011541Srgrimes slpflag = 0; 13021541Srgrimes slptimeo = 0; 13031541Srgrimes } 13041541Srgrimes 1305138469Sps if ((old_lock = VOP_ISLOCKED(vp, td)) != LK_EXCLUSIVE) { 1306138469Sps if (old_lock == LK_SHARED) { 1307138469Sps /* Upgrade to exclusive lock, this might block */ 1308138469Sps vn_lock(vp, LK_UPGRADE | LK_RETRY, td); 1309138469Sps } else { 1310138469Sps vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 1311138469Sps } 1312138469Sps } 1313138469Sps 13141541Srgrimes /* 13151541Srgrimes * Now, flush as required. 13161541Srgrimes */ 1317140220Sphk error = vinvalbuf(vp, flags, td, slpflag, 0); 13181541Srgrimes while (error) { 1319138469Sps if (intrflg && (error = nfs_sigintr(nmp, NULL, td))) 1320138469Sps goto out; 1321140220Sphk error = vinvalbuf(vp, flags, td, 0, slptimeo); 13221541Srgrimes } 1323138469Sps np->n_flag &= ~NMODIFIED; 1324138469Spsout: 1325138469Sps if (old_lock != LK_EXCLUSIVE) { 1326138469Sps if (old_lock == LK_SHARED) { 1327138469Sps /* Downgrade from exclusive lock, this might block */ 1328138469Sps vn_lock(vp, LK_DOWNGRADE, td); 1329138469Sps } else { 1330138469Sps VOP_UNLOCK(vp, 0, td); 1331138469Sps } 1332138469Sps } 1333138469Sps return error; 13341541Srgrimes} 13351541Srgrimes 13361541Srgrimes/* 13371541Srgrimes * Initiate asynchronous I/O. Return an error if no nfsiods are available. 13381541Srgrimes * This is mainly to avoid queueing async I/O requests when the nfsiods 13391541Srgrimes * are all hung on a dead server. 134046349Salc * 134158934Sphk * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp 134246349Salc * is eventually dequeued by the async daemon, nfs_doio() *will*. 13431541Srgrimes */ 13441549Srgrimesint 1345134898Sphknfs_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thread *td) 13461541Srgrimes{ 134789324Speter int iod; 134819449Sdfr int gotiod; 134919449Sdfr int slpflag = 0; 135019449Sdfr int slptimeo = 0; 1351131691Salfred int error, error2; 13521541Srgrimes 135355431Sdillon /* 135483651Speter * Commits are usually short and sweet so lets save some cpu and 135555431Sdillon * leave the async daemons for more important rpc's (such as reads 135655431Sdillon * and writes). 135755431Sdillon */ 135858345Sphk if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) && 135955431Sdillon (nmp->nm_bufqiods > nfs_numasync / 2)) { 136055431Sdillon return(EIO); 136155431Sdillon } 136255431Sdillon 136319449Sdfragain: 136419449Sdfr if (nmp->nm_flag & NFSMNT_INT) 136519449Sdfr slpflag = PCATCH; 136619449Sdfr gotiod = FALSE; 136719449Sdfr 136819449Sdfr /* 136919449Sdfr * Find a free iod to process this request. 137019449Sdfr */ 137189407Speter for (iod = 0; iod < nfs_numasync; iod++) 137289324Speter if (nfs_iodwant[iod]) { 137319449Sdfr gotiod = TRUE; 137425023Sdfr break; 137519449Sdfr } 137619449Sdfr 137719449Sdfr /* 137889324Speter * Try to create one if none are free. 137989324Speter */ 138089324Speter if (!gotiod) { 138189324Speter iod = nfs_nfsiodnew(); 138289324Speter if (iod != -1) 138389324Speter gotiod = TRUE; 138489324Speter } 138589324Speter 138689407Speter if (gotiod) { 138789407Speter /* 138889407Speter * Found one, so wake it up and tell it which 138989407Speter * mount to process. 139089407Speter */ 139189407Speter NFS_DPF(ASYNCIO, ("nfs_asyncio: waking iod %d for mount %p\n", 139289407Speter iod, nmp)); 139399797Sdillon nfs_iodwant[iod] = NULL; 139489407Speter nfs_iodmount[iod] = nmp; 139589407Speter nmp->nm_bufqiods++; 1396111748Sdes wakeup(&nfs_iodwant[iod]); 139789407Speter } 139889407Speter 139989324Speter /* 140019449Sdfr * If none are free, we may already have an iod working on this mount 140119449Sdfr * point. If so, it will process our request. 140219449Sdfr */ 140319449Sdfr if (!gotiod) { 140419449Sdfr if (nmp->nm_bufqiods > 0) { 140519449Sdfr NFS_DPF(ASYNCIO, 140619449Sdfr ("nfs_asyncio: %d iods are already processing mount %p\n", 140719449Sdfr nmp->nm_bufqiods, nmp)); 140819449Sdfr gotiod = TRUE; 140919449Sdfr } 141019449Sdfr } 141119449Sdfr 141219449Sdfr /* 141319449Sdfr * If we have an iod which can process the request, then queue 141419449Sdfr * the buffer. 141519449Sdfr */ 141619449Sdfr if (gotiod) { 141719449Sdfr /* 141855431Sdillon * Ensure that the queue never grows too large. We still want 141955431Sdillon * to asynchronize so we block rather then return EIO. 142019449Sdfr */ 142119449Sdfr while (nmp->nm_bufqlen >= 2*nfs_numasync) { 142219449Sdfr NFS_DPF(ASYNCIO, 142319449Sdfr ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp)); 142419449Sdfr nmp->nm_bufqwant = TRUE; 1425138496Sps error = nfs_tsleep(td, &nmp->nm_bufq, slpflag | PRIBIO, 1426138496Sps "nfsaio", slptimeo); 142719449Sdfr if (error) { 1428131691Salfred error2 = nfs_sigintr(nmp, NULL, td); 1429131691Salfred if (error2) 1430131691Salfred return (error2); 143119449Sdfr if (slpflag == PCATCH) { 143219449Sdfr slpflag = 0; 143319449Sdfr slptimeo = 2 * hz; 143419449Sdfr } 143519449Sdfr } 143619449Sdfr /* 143719449Sdfr * We might have lost our iod while sleeping, 143819449Sdfr * so check and loop if nescessary. 143919449Sdfr */ 144019449Sdfr if (nmp->nm_bufqiods == 0) { 144119449Sdfr NFS_DPF(ASYNCIO, 144219449Sdfr ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp)); 144319449Sdfr goto again; 144419449Sdfr } 144519449Sdfr } 144619449Sdfr 144758345Sphk if (bp->b_iocmd == BIO_READ) { 144884827Sjhb if (bp->b_rcred == NOCRED && cred != NOCRED) 144984827Sjhb bp->b_rcred = crhold(cred); 14501541Srgrimes } else { 145184827Sjhb if (bp->b_wcred == NOCRED && cred != NOCRED) 145284827Sjhb bp->b_wcred = crhold(cred); 14531541Srgrimes } 14548876Srgrimes 1455137846Sjeff if (bp->b_flags & B_REMFREE) 1456137846Sjeff bremfreef(bp); 145748225Smckusick BUF_KERNPROC(bp); 145819449Sdfr TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist); 145919449Sdfr nmp->nm_bufqlen++; 14601541Srgrimes return (0); 146119449Sdfr } 14629336Sdfr 14639336Sdfr /* 146419449Sdfr * All the iods are busy on other mounts, so return EIO to 146519449Sdfr * force the caller to process the i/o synchronously. 14669336Sdfr */ 146719449Sdfr NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n")); 146819449Sdfr return (EIO); 14691541Srgrimes} 14701541Srgrimes 1471138899Spsvoid 1472138899Spsnfs_doio_directwrite(struct buf *bp) 1473138899Sps{ 1474138899Sps int iomode, must_commit; 1475138899Sps struct uio *uiop = (struct uio *)bp->b_caller1; 1476138899Sps char *iov_base = uiop->uio_iov->iov_base; 1477138899Sps struct nfsmount *nmp = VFSTONFS(bp->b_vp->v_mount); 1478138899Sps 1479138899Sps iomode = NFSV3WRITE_FILESYNC; 1480138899Sps uiop->uio_td = NULL; /* NULL since we're in nfsiod */ 1481138899Sps (nmp->nm_rpcops->nr_writerpc)(bp->b_vp, uiop, bp->b_wcred, &iomode, &must_commit); 1482138899Sps KASSERT((must_commit == 0), ("nfs_doio_directwrite: Did not commit write")); 1483138899Sps free(iov_base, M_NFSDIRECTIO); 1484138899Sps free(uiop->uio_iov, M_NFSDIRECTIO); 1485138899Sps free(uiop, M_NFSDIRECTIO); 1486138899Sps vdrop(bp->b_vp); 1487138899Sps bp->b_vp = NULL; 1488138899Sps relpbuf(bp, &nfs_pbuf_freecnt); 1489138899Sps} 1490138899Sps 14911541Srgrimes/* 14921541Srgrimes * Do an I/O operation to/from a cache block. This may be called 14931541Srgrimes * synchronously or from an nfsiod. 14941541Srgrimes */ 14951541Srgrimesint 1496134898Sphknfs_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td) 14971541Srgrimes{ 149844679Sjulian struct uio *uiop; 14991541Srgrimes struct nfsnode *np; 15001541Srgrimes struct nfsmount *nmp; 150146349Salc int error = 0, iomode, must_commit = 0; 15021541Srgrimes struct uio uio; 15031541Srgrimes struct iovec io; 150483651Speter struct proc *p = td ? td->td_proc : NULL; 15051541Srgrimes 15061541Srgrimes np = VTONFS(vp); 15071541Srgrimes nmp = VFSTONFS(vp->v_mount); 15081541Srgrimes uiop = &uio; 15091541Srgrimes uiop->uio_iov = &io; 15101541Srgrimes uiop->uio_iovcnt = 1; 15111541Srgrimes uiop->uio_segflg = UIO_SYSSPACE; 151283366Sjulian uiop->uio_td = td; 15131541Srgrimes 151446349Salc /* 151558934Sphk * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We 151646349Salc * do this here so we do not have to do it in all the code that 151746349Salc * calls us. 151846349Salc */ 151958934Sphk bp->b_flags &= ~B_INVAL; 152058934Sphk bp->b_ioflags &= ~BIO_ERROR; 152146349Salc 152244679Sjulian KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp)); 152344679Sjulian 1524121191Sphk if (bp->b_iocmd == BIO_READ) { 15253664Sphk io.iov_len = uiop->uio_resid = bp->b_bcount; 15263664Sphk io.iov_base = bp->b_data; 15271541Srgrimes uiop->uio_rw = UIO_READ; 152887834Sdillon 15291541Srgrimes switch (vp->v_type) { 15301541Srgrimes case VREG: 15319336Sdfr uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; 15321541Srgrimes nfsstats.read_bios++; 1533122953Salfred error = (nmp->nm_rpcops->nr_readrpc)(vp, uiop, cr); 153487834Sdillon 15351541Srgrimes if (!error) { 15361541Srgrimes if (uiop->uio_resid) { 15371541Srgrimes /* 153846349Salc * If we had a short read with no error, we must have 153946349Salc * hit a file hole. We should zero-fill the remainder. 154046349Salc * This can also occur if the server hits the file EOF. 154146349Salc * 154283651Speter * Holes used to be able to occur due to pending 154346349Salc * writes, but that is not possible any longer. 15441541Srgrimes */ 154546349Salc int nread = bp->b_bcount - uiop->uio_resid; 154687834Sdillon int left = uiop->uio_resid; 154746349Salc 154846349Salc if (left > 0) 154946349Salc bzero((char *)bp->b_data + nread, left); 155046349Salc uiop->uio_resid = 0; 155146349Salc } 15521541Srgrimes } 1553115041Srwatson /* ASSERT_VOP_LOCKED(vp, "nfs_doio"); */ 1554101308Sjeff if (p && (vp->v_vflag & VV_TEXT) && 1555138473Sps (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.va_mtime))) { 155673929Sjhb PROC_LOCK(p); 1557136006Sdas killproc(p, "text file modification"); 155873929Sjhb PROC_UNLOCK(p); 15591541Srgrimes } 15601541Srgrimes break; 15611541Srgrimes case VLNK: 15629336Sdfr uiop->uio_offset = (off_t)0; 15631541Srgrimes nfsstats.readlink_bios++; 1564122953Salfred error = (nmp->nm_rpcops->nr_readlinkrpc)(vp, uiop, cr); 15651541Srgrimes break; 15661541Srgrimes case VDIR: 15671541Srgrimes nfsstats.readdir_bios++; 15689336Sdfr uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ; 1569122698Salfred if ((nmp->nm_flag & NFSMNT_NFSV4) != 0) 1570122698Salfred error = nfs4_readdirrpc(vp, uiop, cr); 1571122698Salfred else { 1572122698Salfred if ((nmp->nm_flag & NFSMNT_RDIRPLUS) != 0) { 1573122698Salfred error = nfs_readdirplusrpc(vp, uiop, cr); 1574122698Salfred if (error == NFSERR_NOTSUPP) 1575122698Salfred nmp->nm_flag &= ~NFSMNT_RDIRPLUS; 1576122698Salfred } 1577122698Salfred if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) 1578122698Salfred error = nfs_readdirrpc(vp, uiop, cr); 15799336Sdfr } 158046349Salc /* 158146349Salc * end-of-directory sets B_INVAL but does not generate an 158246349Salc * error. 158346349Salc */ 158439782Smckusick if (error == 0 && uiop->uio_resid == bp->b_bcount) 158539782Smckusick bp->b_flags |= B_INVAL; 15861541Srgrimes break; 15873305Sphk default: 158883651Speter printf("nfs_doio: type %x unexpected\n", vp->v_type); 15893305Sphk break; 15901541Srgrimes }; 15911541Srgrimes if (error) { 159258934Sphk bp->b_ioflags |= BIO_ERROR; 15931541Srgrimes bp->b_error = error; 15941541Srgrimes } 15951541Srgrimes } else { 159683651Speter /* 159751344Sdillon * If we only need to commit, try to commit 159851344Sdillon */ 159951344Sdillon if (bp->b_flags & B_NEEDCOMMIT) { 160051344Sdillon int retv; 160151344Sdillon off_t off; 160251344Sdillon 160351344Sdillon off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; 1604122953Salfred retv = (nmp->nm_rpcops->nr_commit)( 1605136927Sphk vp, off, bp->b_dirtyend-bp->b_dirtyoff, 160683366Sjulian bp->b_wcred, td); 160751344Sdillon if (retv == 0) { 160851344Sdillon bp->b_dirtyoff = bp->b_dirtyend = 0; 160954480Sdillon bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 161051344Sdillon bp->b_resid = 0; 161159249Sphk bufdone(bp); 161251344Sdillon return (0); 161351344Sdillon } 161451344Sdillon if (retv == NFSERR_STALEWRITEVERF) { 1615136927Sphk nfs_clearcommit(vp->v_mount); 161651344Sdillon } 161751344Sdillon } 161851344Sdillon 161951344Sdillon /* 162051344Sdillon * Setup for actual write 162151344Sdillon */ 162251344Sdillon 162341791Sdt if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size) 162441791Sdt bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE; 16258692Sdg 16268692Sdg if (bp->b_dirtyend > bp->b_dirtyoff) { 16278692Sdg io.iov_len = uiop->uio_resid = bp->b_dirtyend 16289336Sdfr - bp->b_dirtyoff; 162941791Sdt uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE 16309336Sdfr + bp->b_dirtyoff; 16318692Sdg io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; 16328692Sdg uiop->uio_rw = UIO_WRITE; 16338692Sdg nfsstats.write_bios++; 163444679Sjulian 163525785Sdfr if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC) 16369336Sdfr iomode = NFSV3WRITE_UNSTABLE; 16378692Sdg else 16389336Sdfr iomode = NFSV3WRITE_FILESYNC; 163944679Sjulian 1640122953Salfred error = (nmp->nm_rpcops->nr_writerpc)(vp, uiop, cr, &iomode, &must_commit); 164151475Sdillon 164251475Sdillon /* 164351475Sdillon * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try 164451475Sdillon * to cluster the buffers needing commit. This will allow 164551475Sdillon * the system to submit a single commit rpc for the whole 164683651Speter * cluster. We can do this even if the buffer is not 100% 164754480Sdillon * dirty (relative to the NFS blocksize), so we optimize the 164854480Sdillon * append-to-file-case. 164954480Sdillon * 165054480Sdillon * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be 165154480Sdillon * cleared because write clustering only works for commit 165254480Sdillon * rpc's, not for the data portion of the write). 165351475Sdillon */ 165451475Sdillon 165525003Sdfr if (!error && iomode == NFSV3WRITE_UNSTABLE) { 165625003Sdfr bp->b_flags |= B_NEEDCOMMIT; 165725003Sdfr if (bp->b_dirtyoff == 0 165846349Salc && bp->b_dirtyend == bp->b_bcount) 165925003Sdfr bp->b_flags |= B_CLUSTEROK; 166044679Sjulian } else { 166154480Sdillon bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 166244679Sjulian } 16638692Sdg 16649336Sdfr /* 16659336Sdfr * For an interrupted write, the buffer is still valid 16669336Sdfr * and the write hasn't been pushed to the server yet, 166758934Sphk * so we can't set BIO_ERROR and report the interruption 16689336Sdfr * by setting B_EINTR. For the B_ASYNC case, B_EINTR 16699336Sdfr * is not relevant, so the rpc attempt is essentially 16709336Sdfr * a noop. For the case of a V3 write rpc not being 16719336Sdfr * committed to stable storage, the block is still 16729336Sdfr * dirty and requires either a commit rpc or another 16739336Sdfr * write rpc with iomode == NFSV3WRITE_FILESYNC before 16749336Sdfr * the block is reused. This is indicated by setting 16759336Sdfr * the B_DELWRI and B_NEEDCOMMIT flags. 167642957Sdillon * 167742957Sdillon * If the buffer is marked B_PAGING, it does not reside on 167844679Sjulian * the vp's paging queues so we cannot call bdirty(). The 167944679Sjulian * bp in this case is not an NFS cache block so we should 168044679Sjulian * be safe. XXX 16819336Sdfr */ 1682131691Salfred if (error == EINTR || error == EIO 16839336Sdfr || (!error && (bp->b_flags & B_NEEDCOMMIT))) { 168434266Sjulian int s; 168534266Sjulian 168644679Sjulian s = splbio(); 16878692Sdg bp->b_flags &= ~(B_INVAL|B_NOCACHE); 168842957Sdillon if ((bp->b_flags & B_PAGING) == 0) { 168944679Sjulian bdirty(bp); 169044679Sjulian bp->b_flags &= ~B_DONE; 169142957Sdillon } 169247749Speter if (error && (bp->b_flags & B_ASYNC) == 0) 169332755Sdyson bp->b_flags |= B_EINTR; 169444679Sjulian splx(s); 16958692Sdg } else { 169644679Sjulian if (error) { 169758934Sphk bp->b_ioflags |= BIO_ERROR; 169844679Sjulian bp->b_error = np->n_error = error; 169944679Sjulian np->n_flag |= NWRITEERR; 170044679Sjulian } 170144679Sjulian bp->b_dirtyoff = bp->b_dirtyend = 0; 17028692Sdg } 17031541Srgrimes } else { 17048692Sdg bp->b_resid = 0; 170559249Sphk bufdone(bp); 17068692Sdg return (0); 17071541Srgrimes } 17081541Srgrimes } 17091541Srgrimes bp->b_resid = uiop->uio_resid; 17109336Sdfr if (must_commit) 171144679Sjulian nfs_clearcommit(vp->v_mount); 171259249Sphk bufdone(bp); 17131541Srgrimes return (error); 17141541Srgrimes} 171587834Sdillon 171687834Sdillon/* 171787834Sdillon * Used to aid in handling ftruncate() operations on the NFS client side. 171887834Sdillon * Truncation creates a number of special problems for NFS. We have to 171987834Sdillon * throw away VM pages and buffer cache buffers that are beyond EOF, and 172087834Sdillon * we have to properly handle VM pages or (potentially dirty) buffers 172187834Sdillon * that straddle the truncation point. 172287834Sdillon */ 172387834Sdillon 172487834Sdillonint 172587834Sdillonnfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize) 172687834Sdillon{ 172787834Sdillon struct nfsnode *np = VTONFS(vp); 172887834Sdillon u_quad_t tsize = np->n_size; 172987834Sdillon int biosize = vp->v_mount->mnt_stat.f_iosize; 173087834Sdillon int error = 0; 173187834Sdillon 173287834Sdillon np->n_size = nsize; 173387834Sdillon 173487834Sdillon if (np->n_size < tsize) { 173587834Sdillon struct buf *bp; 173687834Sdillon daddr_t lbn; 173787834Sdillon int bufsize; 173887834Sdillon 173987834Sdillon /* 174087834Sdillon * vtruncbuf() doesn't get the buffer overlapping the 174187834Sdillon * truncation point. We may have a B_DELWRI and/or B_CACHE 174287834Sdillon * buffer that now needs to be truncated. 174387834Sdillon */ 174487834Sdillon error = vtruncbuf(vp, cred, td, nsize, biosize); 174587834Sdillon lbn = nsize / biosize; 174687834Sdillon bufsize = nsize & (biosize - 1); 174787834Sdillon bp = nfs_getcacheblk(vp, lbn, bufsize, td); 1748138496Sps if (!bp) 1749138496Sps return EINTR; 175087834Sdillon if (bp->b_dirtyoff > bp->b_bcount) 175187834Sdillon bp->b_dirtyoff = bp->b_bcount; 175287834Sdillon if (bp->b_dirtyend > bp->b_bcount) 175387834Sdillon bp->b_dirtyend = bp->b_bcount; 175487834Sdillon bp->b_flags |= B_RELBUF; /* don't leave garbage around */ 175587834Sdillon brelse(bp); 175687834Sdillon } else { 175787834Sdillon vnode_pager_setsize(vp, nsize); 175887834Sdillon } 175987834Sdillon return(error); 176087834Sdillon} 176187834Sdillon 1762