/* nfs_bio.c — FreeBSD NFS client buffer I/O (revision 192986) */
/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 241541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 251541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 261541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 271541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 281541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 291541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 301541Srgrimes * SUCH DAMAGE. 311541Srgrimes * 3222521Sdyson * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 331541Srgrimes */ 341541Srgrimes 3583651Speter#include <sys/cdefs.h> 3683654Speter__FBSDID("$FreeBSD: head/sys/nfsclient/nfs_bio.c 192986 2009-05-28 18:11:09Z alc $"); 3722521Sdyson 38190380Srwatson#include "opt_kdtrace.h" 39190380Srwatson 401541Srgrimes#include <sys/param.h> 411541Srgrimes#include <sys/systm.h> 4279247Sjhb#include <sys/bio.h> 4379247Sjhb#include <sys/buf.h> 4479247Sjhb#include <sys/kernel.h> 45192578Srwatson#include <sys/mbuf.h> 4679247Sjhb#include <sys/mount.h> 4779247Sjhb#include <sys/proc.h> 481541Srgrimes#include <sys/resourcevar.h> 493305Sphk#include <sys/signalvar.h> 5079247Sjhb#include <sys/vmmeter.h> 511541Srgrimes#include <sys/vnode.h> 521541Srgrimes 531541Srgrimes#include <vm/vm.h> 5412662Sdg#include <vm/vm_extern.h> 5525930Sdfr#include <vm/vm_page.h> 5625930Sdfr#include <vm/vm_object.h> 5725930Sdfr#include <vm/vm_pager.h> 5825930Sdfr#include <vm/vnode_pager.h> 591541Srgrimes 601541Srgrimes#include <nfs/rpcv2.h> 619336Sdfr#include <nfs/nfsproto.h> 6283651Speter#include <nfsclient/nfs.h> 6383651Speter#include <nfsclient/nfsmount.h> 6483651Speter#include <nfsclient/nfsnode.h> 65190380Srwatson#include <nfsclient/nfs_kdtrace.h> 661541Srgrimes 6783651Speterstatic struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, 6883651Speter struct thread 
*td); 69138899Spsstatic int nfs_directio_write(struct vnode *vp, struct uio *uiop, 70138899Sps struct ucred *cred, int ioflag); 7175580Sphk 72138899Spsextern int nfs_directio_enable; 73138899Spsextern int nfs_directio_allow_mmap; 74158739Smohans 751541Srgrimes/* 7625930Sdfr * Vnode op for VM getpages. 7725930Sdfr */ 7825930Sdfrint 7983651Speternfs_getpages(struct vop_getpages_args *ap) 8025930Sdfr{ 8146349Salc int i, error, nextoff, size, toff, count, npages; 8232755Sdyson struct uio uio; 8332755Sdyson struct iovec iov; 8432755Sdyson vm_offset_t kva; 8534206Sdyson struct buf *bp; 8636563Speter struct vnode *vp; 8783366Sjulian struct thread *td; 8836563Speter struct ucred *cred; 8936563Speter struct nfsmount *nmp; 90116461Salc vm_object_t object; 9136563Speter vm_page_t *pages; 92138899Sps struct nfsnode *np; 9325930Sdfr 9436563Speter vp = ap->a_vp; 95138899Sps np = VTONFS(vp); 9683366Sjulian td = curthread; /* XXX */ 9791406Sjhb cred = curthread->td_ucred; /* XXX */ 9836563Speter nmp = VFSTONFS(vp->v_mount); 9936563Speter pages = ap->a_m; 10036563Speter count = ap->a_count; 10136563Speter 102116461Salc if ((object = vp->v_object) == NULL) { 103158739Smohans nfs_printf("nfs_getpages: called with non-merged cache vnode??\n"); 10436563Speter return VM_PAGER_ERROR; 10525930Sdfr } 10625930Sdfr 107158739Smohans if (nfs_directio_enable && !nfs_directio_allow_mmap) { 108158739Smohans mtx_lock(&np->n_mtx); 109158739Smohans if ((np->n_flag & NNONCACHE) && (vp->v_type == VREG)) { 110158739Smohans mtx_unlock(&np->n_mtx); 111158739Smohans nfs_printf("nfs_getpages: called on non-cacheable vnode??\n"); 112158739Smohans return VM_PAGER_ERROR; 113158739Smohans } else 114158739Smohans mtx_unlock(&np->n_mtx); 115138899Sps } 116138899Sps 117158739Smohans mtx_lock(&nmp->nm_mtx); 11836563Speter if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 119158739Smohans (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { 120158739Smohans mtx_unlock(&nmp->nm_mtx); 121122698Salfred /* We'll never get here for v4, 
because we always have fsinfo */ 12283366Sjulian (void)nfs_fsinfo(nmp, vp, cred, td); 123158739Smohans } else 124158739Smohans mtx_unlock(&nmp->nm_mtx); 12546349Salc 12646349Salc npages = btoc(count); 12746349Salc 12834206Sdyson /* 12946349Salc * If the requested page is partially valid, just return it and 13046349Salc * allow the pager to zero-out the blanks. Partially valid pages 13146349Salc * can only occur at the file EOF. 13246349Salc */ 13346349Salc 13446349Salc { 13546349Salc vm_page_t m = pages[ap->a_reqpage]; 13646349Salc 137116461Salc VM_OBJECT_LOCK(object); 13846349Salc if (m->valid != 0) { 139191964Salc vm_page_lock_queues(); 14046349Salc for (i = 0; i < npages; ++i) { 14146349Salc if (i != ap->a_reqpage) 14275692Salfred vm_page_free(pages[i]); 14346349Salc } 144100450Salc vm_page_unlock_queues(); 145116461Salc VM_OBJECT_UNLOCK(object); 14646349Salc return(0); 14746349Salc } 148116461Salc VM_OBJECT_UNLOCK(object); 14946349Salc } 15046349Salc 15146349Salc /* 15234206Sdyson * We use only the kva address for the buffer, but this is extremely 15334206Sdyson * convienient and fast. 
15434206Sdyson */ 15542957Sdillon bp = getpbuf(&nfs_pbuf_freecnt); 15625930Sdfr 15734206Sdyson kva = (vm_offset_t) bp->b_data; 15836563Speter pmap_qenter(kva, pages, npages); 159170292Sattilio PCPU_INC(cnt.v_vnodein); 160170292Sattilio PCPU_ADD(cnt.v_vnodepgsin, npages); 16134206Sdyson 16232755Sdyson iov.iov_base = (caddr_t) kva; 16336563Speter iov.iov_len = count; 16432755Sdyson uio.uio_iov = &iov; 16532755Sdyson uio.uio_iovcnt = 1; 16636563Speter uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); 16736563Speter uio.uio_resid = count; 16832755Sdyson uio.uio_segflg = UIO_SYSSPACE; 16932755Sdyson uio.uio_rw = UIO_READ; 17083366Sjulian uio.uio_td = td; 17125930Sdfr 172122953Salfred error = (nmp->nm_rpcops->nr_readrpc)(vp, &uio, cred); 17334206Sdyson pmap_qremove(kva, npages); 17432755Sdyson 17542957Sdillon relpbuf(bp, &nfs_pbuf_freecnt); 17634206Sdyson 17742957Sdillon if (error && (uio.uio_resid == count)) { 178158739Smohans nfs_printf("nfs_getpages: error %d\n", error); 179116461Salc VM_OBJECT_LOCK(object); 180100450Salc vm_page_lock_queues(); 18142957Sdillon for (i = 0; i < npages; ++i) { 18242957Sdillon if (i != ap->a_reqpage) 18375692Salfred vm_page_free(pages[i]); 18442957Sdillon } 185100450Salc vm_page_unlock_queues(); 186116461Salc VM_OBJECT_UNLOCK(object); 18734206Sdyson return VM_PAGER_ERROR; 18842957Sdillon } 18934206Sdyson 19045347Sjulian /* 19145347Sjulian * Calculate the number of bytes read and validate only that number 19245347Sjulian * of bytes. Note that due to pending writes, size may be 0. This 19345347Sjulian * does not mean that the remaining data is invalid! 
19445347Sjulian */ 19545347Sjulian 19636563Speter size = count - uio.uio_resid; 197116461Salc VM_OBJECT_LOCK(object); 198100450Salc vm_page_lock_queues(); 19934206Sdyson for (i = 0, toff = 0; i < npages; i++, toff = nextoff) { 20034206Sdyson vm_page_t m; 20134206Sdyson nextoff = toff + PAGE_SIZE; 20236563Speter m = pages[i]; 20334206Sdyson 20434206Sdyson if (nextoff <= size) { 20545347Sjulian /* 20645347Sjulian * Read operation filled an entire page 20745347Sjulian */ 20834206Sdyson m->valid = VM_PAGE_BITS_ALL; 209192010Salc KASSERT(m->dirty == 0, 210192010Salc ("nfs_getpages: page %p is dirty", m)); 21145347Sjulian } else if (size > toff) { 21245347Sjulian /* 21346349Salc * Read operation filled a partial page. 21445347Sjulian */ 21546349Salc m->valid = 0; 216192134Salc vm_page_set_valid(m, 0, size - toff); 217192986Salc KASSERT(m->dirty == 0, 218192134Salc ("nfs_getpages: page %p is dirty", m)); 21987834Sdillon } else { 22087834Sdillon /* 22187834Sdillon * Read operation was short. If no error occured 22287834Sdillon * we may have hit a zero-fill section. We simply 22387834Sdillon * leave valid set to 0. 22487834Sdillon */ 22587834Sdillon ; 22634206Sdyson } 22725930Sdfr if (i != ap->a_reqpage) { 22834206Sdyson /* 22934206Sdyson * Whether or not to leave the page activated is up in 23034206Sdyson * the air, but we should put the page on a page queue 23134206Sdyson * somewhere (it already is in the object). Result: 23234206Sdyson * It appears that emperical results show that 23334206Sdyson * deactivating pages is best. 23434206Sdyson */ 23534206Sdyson 23634206Sdyson /* 23734206Sdyson * Just in case someone was asking for this page we 23834206Sdyson * now tell them that it is ok to use. 
23934206Sdyson */ 24034206Sdyson if (!error) { 241161125Salc if (m->oflags & VPO_WANTED) 24234206Sdyson vm_page_activate(m); 24334206Sdyson else 24434206Sdyson vm_page_deactivate(m); 24538799Sdfr vm_page_wakeup(m); 24634206Sdyson } else { 24775692Salfred vm_page_free(m); 24834206Sdyson } 24925930Sdfr } 25025930Sdfr } 251100450Salc vm_page_unlock_queues(); 252116461Salc VM_OBJECT_UNLOCK(object); 25325930Sdfr return 0; 25425930Sdfr} 25525930Sdfr 25625930Sdfr/* 25734206Sdyson * Vnode op for VM putpages. 25834096Smsmith */ 25934096Smsmithint 26083651Speternfs_putpages(struct vop_putpages_args *ap) 26134096Smsmith{ 26234206Sdyson struct uio uio; 26334206Sdyson struct iovec iov; 26434206Sdyson vm_offset_t kva; 26534206Sdyson struct buf *bp; 26636563Speter int iomode, must_commit, i, error, npages, count; 26746349Salc off_t offset; 26834206Sdyson int *rtvals; 26936563Speter struct vnode *vp; 27083366Sjulian struct thread *td; 27136563Speter struct ucred *cred; 27236563Speter struct nfsmount *nmp; 27346349Salc struct nfsnode *np; 27436563Speter vm_page_t *pages; 27534206Sdyson 27636563Speter vp = ap->a_vp; 27746349Salc np = VTONFS(vp); 27883366Sjulian td = curthread; /* XXX */ 27991406Sjhb cred = curthread->td_ucred; /* XXX */ 28036563Speter nmp = VFSTONFS(vp->v_mount); 28136563Speter pages = ap->a_m; 28236563Speter count = ap->a_count; 28334206Sdyson rtvals = ap->a_rtvals; 28436563Speter npages = btoc(count); 28546349Salc offset = IDX_TO_OFF(pages[0]->pindex); 286158739Smohans 287158739Smohans mtx_lock(&nmp->nm_mtx); 28836563Speter if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 28976827Salfred (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { 290158739Smohans mtx_unlock(&nmp->nm_mtx); 29183366Sjulian (void)nfs_fsinfo(nmp, vp, cred, td); 292158739Smohans } else 293158739Smohans mtx_unlock(&nmp->nm_mtx); 29434206Sdyson 295158739Smohans mtx_lock(&np->n_mtx); 296157557Smohans if (nfs_directio_enable && !nfs_directio_allow_mmap && 297158739Smohans (np->n_flag & NNONCACHE) && (vp->v_type 
== VREG)) { 298158739Smohans mtx_unlock(&np->n_mtx); 299158739Smohans nfs_printf("nfs_putpages: called on noncache-able vnode??\n"); 300158739Smohans mtx_lock(&np->n_mtx); 301158739Smohans } 302138899Sps 30383651Speter for (i = 0; i < npages; i++) 30434206Sdyson rtvals[i] = VM_PAGER_AGAIN; 30534206Sdyson 30634206Sdyson /* 30746349Salc * When putting pages, do not extend file past EOF. 30846349Salc */ 30946349Salc if (offset + count > np->n_size) { 31046349Salc count = np->n_size - offset; 31146349Salc if (count < 0) 31246349Salc count = 0; 31346349Salc } 314158739Smohans mtx_unlock(&np->n_mtx); 31546349Salc 31646349Salc /* 31734206Sdyson * We use only the kva address for the buffer, but this is extremely 31834206Sdyson * convienient and fast. 31934206Sdyson */ 32042957Sdillon bp = getpbuf(&nfs_pbuf_freecnt); 32134206Sdyson 32234206Sdyson kva = (vm_offset_t) bp->b_data; 32336563Speter pmap_qenter(kva, pages, npages); 324170292Sattilio PCPU_INC(cnt.v_vnodeout); 325170292Sattilio PCPU_ADD(cnt.v_vnodepgsout, count); 32634206Sdyson 32734206Sdyson iov.iov_base = (caddr_t) kva; 32836563Speter iov.iov_len = count; 32934206Sdyson uio.uio_iov = &iov; 33034206Sdyson uio.uio_iovcnt = 1; 33146349Salc uio.uio_offset = offset; 33236563Speter uio.uio_resid = count; 33334206Sdyson uio.uio_segflg = UIO_SYSSPACE; 33434206Sdyson uio.uio_rw = UIO_WRITE; 33583366Sjulian uio.uio_td = td; 33634206Sdyson 33734206Sdyson if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0) 33834206Sdyson iomode = NFSV3WRITE_UNSTABLE; 33934206Sdyson else 34034206Sdyson iomode = NFSV3WRITE_FILESYNC; 34134206Sdyson 342122953Salfred error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, &iomode, &must_commit); 34334206Sdyson 34434206Sdyson pmap_qremove(kva, npages); 34542957Sdillon relpbuf(bp, &nfs_pbuf_freecnt); 34634206Sdyson 34734206Sdyson if (!error) { 34836563Speter int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; 34934206Sdyson for (i = 0; i < nwritten; i++) { 35034206Sdyson rtvals[i] = VM_PAGER_OK; 
35149945Salc vm_page_undirty(pages[i]); 35234206Sdyson } 35376827Salfred if (must_commit) { 35436563Speter nfs_clearcommit(vp->v_mount); 35576827Salfred } 35634206Sdyson } 35736563Speter return rtvals[0]; 35834096Smsmith} 35934096Smsmith 36034096Smsmith/* 361158739Smohans * For nfs, cache consistency can only be maintained approximately. 362158739Smohans * Although RFC1094 does not specify the criteria, the following is 363158739Smohans * believed to be compatible with the reference port. 364158739Smohans * For nfs: 365158739Smohans * If the file's modify time on the server has changed since the 366158739Smohans * last read rpc or you have written to the file, 367158739Smohans * you may have lost data cache consistency with the 368158739Smohans * server, so flush all of the file's data out of the cache. 369158739Smohans * Then force a getattr rpc to ensure that you have up to date 370158739Smohans * attributes. 371158739Smohans * NB: This implies that cache data can be read when up to 372158739Smohans * NFS_ATTRTIMEO seconds out of date. If you find that you need current 373158739Smohans * attributes this could be forced by setting n_attrstamp to 0 before 374158739Smohans * the VOP_GETATTR() call. 375158739Smohans */ 376158739Smohansstatic inline int 377158739Smohansnfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred) 378158739Smohans{ 379158739Smohans int error = 0; 380158739Smohans struct vattr vattr; 381158739Smohans struct nfsnode *np = VTONFS(vp); 382158739Smohans int old_lock; 383158739Smohans struct nfsmount *nmp = VFSTONFS(vp->v_mount); 384158739Smohans 385158739Smohans /* 386158739Smohans * Grab the exclusive lock before checking whether the cache is 387158739Smohans * consistent. 388158739Smohans * XXX - We can make this cheaper later (by acquiring cheaper locks). 389158739Smohans * But for now, this suffices. 
390158739Smohans */ 391176134Sattilio old_lock = nfs_upgrade_vnlock(vp); 392158739Smohans mtx_lock(&np->n_mtx); 393158739Smohans if (np->n_flag & NMODIFIED) { 394158739Smohans mtx_unlock(&np->n_mtx); 395158739Smohans if (vp->v_type != VREG) { 396158739Smohans if (vp->v_type != VDIR) 397158739Smohans panic("nfs: bioread, not dir"); 398158739Smohans (nmp->nm_rpcops->nr_invaldir)(vp); 399158739Smohans error = nfs_vinvalbuf(vp, V_SAVE, td, 1); 400158739Smohans if (error) 401158739Smohans goto out; 402158739Smohans } 403158739Smohans np->n_attrstamp = 0; 404190380Srwatson KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); 405182371Sattilio error = VOP_GETATTR(vp, &vattr, cred); 406158739Smohans if (error) 407158739Smohans goto out; 408158739Smohans mtx_lock(&np->n_mtx); 409158739Smohans np->n_mtime = vattr.va_mtime; 410158739Smohans mtx_unlock(&np->n_mtx); 411158739Smohans } else { 412158739Smohans mtx_unlock(&np->n_mtx); 413182371Sattilio error = VOP_GETATTR(vp, &vattr, cred); 414158739Smohans if (error) 415158739Smohans return (error); 416158739Smohans mtx_lock(&np->n_mtx); 417158739Smohans if ((np->n_flag & NSIZECHANGED) 418158739Smohans || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) { 419158739Smohans mtx_unlock(&np->n_mtx); 420158739Smohans if (vp->v_type == VDIR) 421158739Smohans (nmp->nm_rpcops->nr_invaldir)(vp); 422158739Smohans error = nfs_vinvalbuf(vp, V_SAVE, td, 1); 423158739Smohans if (error) 424158739Smohans goto out; 425158739Smohans mtx_lock(&np->n_mtx); 426158739Smohans np->n_mtime = vattr.va_mtime; 427158739Smohans np->n_flag &= ~NSIZECHANGED; 428158739Smohans } 429158739Smohans mtx_unlock(&np->n_mtx); 430158739Smohans } 431158739Smohansout: 432176134Sattilio nfs_downgrade_vnlock(vp, old_lock); 433158739Smohans return error; 434158739Smohans} 435158739Smohans 436158739Smohans/* 4371541Srgrimes * Vnode op for read using bio 4381541Srgrimes */ 4391549Srgrimesint 44083651Speternfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred) 
4411541Srgrimes{ 44283651Speter struct nfsnode *np = VTONFS(vp); 44383651Speter int biosize, i; 444143822Sdas struct buf *bp, *rabp; 44583366Sjulian struct thread *td; 4469336Sdfr struct nfsmount *nmp = VFSTONFS(vp->v_mount); 4475455Sdg daddr_t lbn, rabn; 44846349Salc int bcount; 44951344Sdillon int seqcount; 45046349Salc int nra, error = 0, n = 0, on = 0; 4511541Srgrimes 4521541Srgrimes#ifdef DIAGNOSTIC 4531541Srgrimes if (uio->uio_rw != UIO_READ) 4541541Srgrimes panic("nfs_read mode"); 4551541Srgrimes#endif 4561541Srgrimes if (uio->uio_resid == 0) 4571541Srgrimes return (0); 45836473Speter if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */ 4591541Srgrimes return (EINVAL); 46083366Sjulian td = uio->uio_td; 46151344Sdillon 462158739Smohans mtx_lock(&nmp->nm_mtx); 46336176Speter if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 464158739Smohans (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { 465158739Smohans mtx_unlock(&nmp->nm_mtx); 46683366Sjulian (void)nfs_fsinfo(nmp, vp, cred, td); 467158739Smohans } else 468158739Smohans mtx_unlock(&nmp->nm_mtx); 469158739Smohans 47036473Speter if (vp->v_type != VDIR && 47136473Speter (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) 47236473Speter return (EFBIG); 473138899Sps 474138899Sps if (nfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG)) 475138899Sps /* No caching/ no readaheads. 
Just read data into the user buffer */ 476138899Sps return nfs_readrpc(vp, uio, cred); 477138899Sps 4789428Sdfr biosize = vp->v_mount->mnt_stat.f_iosize; 479108357Sdillon seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE); 480158739Smohans 481158739Smohans error = nfs_bioread_check_cons(vp, td, cred); 482158739Smohans if (error) 483158739Smohans return error; 484158739Smohans 48583651Speter do { 486158739Smohans u_quad_t nsize; 487158739Smohans 488158739Smohans mtx_lock(&np->n_mtx); 489158739Smohans nsize = np->n_size; 490158739Smohans mtx_unlock(&np->n_mtx); 491158739Smohans 4921541Srgrimes switch (vp->v_type) { 4931541Srgrimes case VREG: 4941541Srgrimes nfsstats.biocache_reads++; 4951541Srgrimes lbn = uio->uio_offset / biosize; 4969336Sdfr on = uio->uio_offset & (biosize - 1); 4971541Srgrimes 4981541Srgrimes /* 4991541Srgrimes * Start the read ahead(s), as required. 5001541Srgrimes */ 501158739Smohans if (nmp->nm_readahead > 0) { 50251344Sdillon for (nra = 0; nra < nmp->nm_readahead && nra < seqcount && 503158739Smohans (off_t)(lbn + 1 + nra) * biosize < nsize; nra++) { 5045455Sdg rabn = lbn + 1 + nra; 505136767Sphk if (incore(&vp->v_bufobj, rabn) == NULL) { 50683366Sjulian rabp = nfs_getcacheblk(vp, rabn, biosize, td); 507131691Salfred if (!rabp) { 508131691Salfred error = nfs_sigintr(nmp, NULL, td); 509131691Salfred return (error ? 
error : EINTR); 510131691Salfred } 5118692Sdg if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 51258345Sphk rabp->b_flags |= B_ASYNC; 51358345Sphk rabp->b_iocmd = BIO_READ; 5145455Sdg vfs_busy_pages(rabp, 0); 515134898Sphk if (nfs_asyncio(nmp, rabp, cred, td)) { 51658934Sphk rabp->b_flags |= B_INVAL; 51758934Sphk rabp->b_ioflags |= BIO_ERROR; 5185455Sdg vfs_unbusy_pages(rabp); 5191541Srgrimes brelse(rabp); 52055431Sdillon break; 5211541Srgrimes } 52255431Sdillon } else { 5235471Sdg brelse(rabp); 52455431Sdillon } 5251541Srgrimes } 5261541Srgrimes } 5271541Srgrimes } 5281541Srgrimes 529148268Sps /* Note that bcount is *not* DEV_BSIZE aligned. */ 53046349Salc bcount = biosize; 531158739Smohans if ((off_t)lbn * biosize >= nsize) { 53246349Salc bcount = 0; 533158739Smohans } else if ((off_t)(lbn + 1) * biosize > nsize) { 534158739Smohans bcount = nsize - (off_t)lbn * biosize; 5358692Sdg } 53683366Sjulian bp = nfs_getcacheblk(vp, lbn, bcount, td); 53754605Sdillon 538131691Salfred if (!bp) { 539131691Salfred error = nfs_sigintr(nmp, NULL, td); 540131691Salfred return (error ? error : EINTR); 541131691Salfred } 54242957Sdillon 54325930Sdfr /* 54446349Salc * If B_CACHE is not set, we must issue the read. If this 54546349Salc * fails, we return an error. 54625930Sdfr */ 54746349Salc 5487871Sdg if ((bp->b_flags & B_CACHE) == 0) { 54958345Sphk bp->b_iocmd = BIO_READ; 55032755Sdyson vfs_busy_pages(bp, 0); 551134898Sphk error = nfs_doio(vp, bp, cred, td); 55232755Sdyson if (error) { 55332755Sdyson brelse(bp); 55432755Sdyson return (error); 55532755Sdyson } 5561541Srgrimes } 55746349Salc 55846349Salc /* 55946349Salc * on is the offset into the current bp. Figure out how many 56046349Salc * bytes we can copy out of the bp. Note that bcount is 56146349Salc * NOT DEV_BSIZE aligned. 56246349Salc * 56346349Salc * Then figure out how many bytes we can copy into the uio. 
56446349Salc */ 56546349Salc 56646349Salc n = 0; 56746349Salc if (on < bcount) 56846349Salc n = min((unsigned)(bcount - on), uio->uio_resid); 5691541Srgrimes break; 5701541Srgrimes case VLNK: 5711541Srgrimes nfsstats.biocache_readlinks++; 57283366Sjulian bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td); 573131691Salfred if (!bp) { 574131691Salfred error = nfs_sigintr(nmp, NULL, td); 575131691Salfred return (error ? error : EINTR); 576131691Salfred } 5777871Sdg if ((bp->b_flags & B_CACHE) == 0) { 57858345Sphk bp->b_iocmd = BIO_READ; 57932755Sdyson vfs_busy_pages(bp, 0); 580134898Sphk error = nfs_doio(vp, bp, cred, td); 58132755Sdyson if (error) { 58258934Sphk bp->b_ioflags |= BIO_ERROR; 58332755Sdyson brelse(bp); 58432755Sdyson return (error); 58532755Sdyson } 5861541Srgrimes } 5871541Srgrimes n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); 5881541Srgrimes on = 0; 5891541Srgrimes break; 5901541Srgrimes case VDIR: 5911541Srgrimes nfsstats.biocache_readdirs++; 59224577Sdfr if (np->n_direofoffset 59324577Sdfr && uio->uio_offset >= np->n_direofoffset) { 59424577Sdfr return (0); 59524577Sdfr } 59636979Sbde lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ; 5979336Sdfr on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); 59883366Sjulian bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td); 599131691Salfred if (!bp) { 600131691Salfred error = nfs_sigintr(nmp, NULL, td); 601131691Salfred return (error ? error : EINTR); 602131691Salfred } 6037871Sdg if ((bp->b_flags & B_CACHE) == 0) { 60458345Sphk bp->b_iocmd = BIO_READ; 6059336Sdfr vfs_busy_pages(bp, 0); 606134898Sphk error = nfs_doio(vp, bp, cred, td); 60732912Stegge if (error) { 60832912Stegge brelse(bp); 60932912Stegge } 61032755Sdyson while (error == NFSERR_BAD_COOKIE) { 611122953Salfred (nmp->nm_rpcops->nr_invaldir)(vp); 612140731Sphk error = nfs_vinvalbuf(vp, 0, td, 1); 61332755Sdyson /* 61432755Sdyson * Yuck! The directory has been modified on the 61532755Sdyson * server. 
The only way to get the block is by 61632755Sdyson * reading from the beginning to get all the 61732755Sdyson * offset cookies. 61846349Salc * 61946349Salc * Leave the last bp intact unless there is an error. 62046349Salc * Loop back up to the while if the error is another 62146349Salc * NFSERR_BAD_COOKIE (double yuch!). 62232755Sdyson */ 62332755Sdyson for (i = 0; i <= lbn && !error; i++) { 62432755Sdyson if (np->n_direofoffset 62532755Sdyson && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) 62624577Sdfr return (0); 62783366Sjulian bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td); 628131691Salfred if (!bp) { 629131691Salfred error = nfs_sigintr(nmp, NULL, td); 630131691Salfred return (error ? error : EINTR); 631131691Salfred } 63246349Salc if ((bp->b_flags & B_CACHE) == 0) { 63358345Sphk bp->b_iocmd = BIO_READ; 63446349Salc vfs_busy_pages(bp, 0); 635134898Sphk error = nfs_doio(vp, bp, cred, td); 63646349Salc /* 63746349Salc * no error + B_INVAL == directory EOF, 63846349Salc * use the block. 63946349Salc */ 64046349Salc if (error == 0 && (bp->b_flags & B_INVAL)) 64146349Salc break; 64246349Salc } 64346349Salc /* 64446349Salc * An error will throw away the block and the 64546349Salc * for loop will break out. If no error and this 64646349Salc * is not the block we want, we throw away the 64746349Salc * block and go for the next one via the for loop. 64846349Salc */ 64946349Salc if (error || i < lbn) 65032755Sdyson brelse(bp); 6511541Srgrimes } 65232912Stegge } 65346349Salc /* 65446349Salc * The above while is repeated if we hit another cookie 65546349Salc * error. If we hit an error and it wasn't a cookie error, 65646349Salc * we give up. 65746349Salc */ 65832912Stegge if (error) 6599336Sdfr return (error); 6601541Srgrimes } 6611541Srgrimes 6621541Srgrimes /* 6631541Srgrimes * If not eof and read aheads are enabled, start one. 6641541Srgrimes * (You need the current block first, so that you have the 6659336Sdfr * directory offset cookie of the next block.) 
6661541Srgrimes */ 66789324Speter if (nmp->nm_readahead > 0 && 66839782Smckusick (bp->b_flags & B_INVAL) == 0 && 6699336Sdfr (np->n_direofoffset == 0 || 6709336Sdfr (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && 671136767Sphk incore(&vp->v_bufobj, lbn + 1) == NULL) { 67283366Sjulian rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td); 6731541Srgrimes if (rabp) { 6748692Sdg if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 67558345Sphk rabp->b_flags |= B_ASYNC; 67658345Sphk rabp->b_iocmd = BIO_READ; 6775455Sdg vfs_busy_pages(rabp, 0); 678134898Sphk if (nfs_asyncio(nmp, rabp, cred, td)) { 67958934Sphk rabp->b_flags |= B_INVAL; 68058934Sphk rabp->b_ioflags |= BIO_ERROR; 6815455Sdg vfs_unbusy_pages(rabp); 6821541Srgrimes brelse(rabp); 6831541Srgrimes } 6845471Sdg } else { 6855471Sdg brelse(rabp); 6861541Srgrimes } 6871541Srgrimes } 6881541Srgrimes } 68926469Sdfr /* 69046349Salc * Unlike VREG files, whos buffer size ( bp->b_bcount ) is 69146349Salc * chopped for the EOF condition, we cannot tell how large 69246349Salc * NFS directories are going to be until we hit EOF. So 69346349Salc * an NFS directory buffer is *not* chopped to its EOF. Now, 69446349Salc * it just so happens that b_resid will effectively chop it 69546349Salc * to EOF. *BUT* this information is lost if the buffer goes 69646349Salc * away and is reconstituted into a B_CACHE state ( due to 69746349Salc * being VMIO ) later. So we keep track of the directory eof 69883651Speter * in np->n_direofoffset and chop it off as an extra step 69946349Salc * right here. 
70026469Sdfr */ 70126469Sdfr n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); 70246349Salc if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset) 70346349Salc n = np->n_direofoffset - uio->uio_offset; 7041541Srgrimes break; 7053305Sphk default: 706158739Smohans nfs_printf(" nfs_bioread: type %x unexpected\n", vp->v_type); 707143822Sdas bp = NULL; 7083305Sphk break; 7091541Srgrimes }; 7101541Srgrimes 7111541Srgrimes if (n > 0) { 71234206Sdyson error = uiomove(bp->b_data + on, (int)n, uio); 7131541Srgrimes } 714143822Sdas if (vp->v_type == VLNK) 7151541Srgrimes n = 0; 716143822Sdas if (bp != NULL) 717143822Sdas brelse(bp); 7181541Srgrimes } while (error == 0 && uio->uio_resid > 0 && n > 0); 7191541Srgrimes return (error); 7201541Srgrimes} 7211541Srgrimes 7221541Srgrimes/* 723138899Sps * The NFS write path cannot handle iovecs with len > 1. So we need to 724138899Sps * break up iovecs accordingly (restricting them to wsize). 725138899Sps * For the SYNC case, we can do this with 1 copy (user buffer -> mbuf). 726138899Sps * For the ASYNC case, 2 copies are needed. The first a copy from the 727138899Sps * user buffer to a staging buffer and then a second copy from the staging 728138899Sps * buffer to mbufs. This can be optimized by copying from the user buffer 729138899Sps * directly into mbufs and passing the chain down, but that requires a 730138899Sps * fair amount of re-working of the relevant codepaths (and can be done 731138899Sps * later). 
 */
/*
 * nfs_directio_write(): write path for O_DIRECT regular files, bypassing
 * the buffer cache.
 *
 * IO_SYNC writes are issued directly from the caller's (userspace) buffer,
 * one nm_wsize-sized FILESYNC write RPC at a time.  Otherwise each chunk is
 * copied into a privately allocated uio/iovec and handed to an nfsiod via
 * nfs_asyncio(); on a non-EINTR queueing failure we fall back to the
 * synchronous loop (goto do_sync).
 *
 * Returns 0 on success or an errno from the write RPC / nfs_asyncio().
 */
static int
nfs_directio_write(vp, uiop, cred, ioflag)
	struct vnode *vp;
	struct uio *uiop;
	struct ucred *cred;
	int ioflag;
{
	int error;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct thread *td = uiop->uio_td;
	int size;
	int wsize;

	/* Snapshot nm_wsize under the mount mutex; it is used unlocked below. */
	mtx_lock(&nmp->nm_mtx);
	wsize = nmp->nm_wsize;
	mtx_unlock(&nmp->nm_mtx);
	if (ioflag & IO_SYNC) {
		int iomode, must_commit;
		struct uio uio;
		struct iovec iov;
do_sync:
		while (uiop->uio_resid > 0) {
			size = min(uiop->uio_resid, wsize);
			size = min(uiop->uio_iov->iov_len, size);
			/* RPC directly out of the caller's buffer. */
			iov.iov_base = uiop->uio_iov->iov_base;
			iov.iov_len = size;
			uio.uio_iov = &iov;
			uio.uio_iovcnt = 1;
			uio.uio_offset = uiop->uio_offset;
			uio.uio_resid = size;
			uio.uio_segflg = UIO_USERSPACE;
			uio.uio_rw = UIO_WRITE;
			uio.uio_td = td;
			iomode = NFSV3WRITE_FILESYNC;
			error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred,
						   &iomode, &must_commit);
			/* FILESYNC writes must never come back needing a commit. */
			KASSERT((must_commit == 0),
				("nfs_directio_write: Did not commit write"));
			if (error)
				return (error);
			/* Advance the caller's uio past the chunk just written. */
			uiop->uio_offset += size;
			uiop->uio_resid -= size;
			if (uiop->uio_iov->iov_len <= size) {
				uiop->uio_iovcnt--;
				uiop->uio_iov++;
			} else {
				uiop->uio_iov->iov_base =
					(char *)uiop->uio_iov->iov_base + size;
				uiop->uio_iov->iov_len -= size;
			}
		}
	} else {
		struct uio *t_uio;
		struct iovec *t_iov;
		struct buf *bp;

		/*
		 * Break up the write into blocksize chunks and hand these
		 * over to nfsiod's for write back.
		 * Unfortunately, this incurs a copy of the data. Since
		 * the user could modify the buffer before the write is
		 * initiated.
		 *
		 * The obvious optimization here is that one of the 2 copies
		 * in the async write path can be eliminated by copying the
		 * data here directly into mbufs and passing the mbuf chain
		 * down. But that will require a fair amount of re-working
		 * of the code and can be done if there's enough interest
		 * in NFS directio access.
		 */
		while (uiop->uio_resid > 0) {
			size = min(uiop->uio_resid, wsize);
			size = min(uiop->uio_iov->iov_len, size);
			bp = getpbuf(&nfs_pbuf_freecnt);
			t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK);
			t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK);
			t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK);
			t_iov->iov_len = size;
			t_uio->uio_iov = t_iov;
			t_uio->uio_iovcnt = 1;
			t_uio->uio_offset = uiop->uio_offset;
			t_uio->uio_resid = size;
			t_uio->uio_segflg = UIO_SYSSPACE;
			t_uio->uio_rw = UIO_WRITE;
			t_uio->uio_td = td;
			/* Private copy: caller may reuse its buffer immediately. */
			bcopy(uiop->uio_iov->iov_base, t_iov->iov_base, size);
			bp->b_flags |= B_DIRECT;
			bp->b_iocmd = BIO_WRITE;
			if (cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			} else
				bp->b_wcred = NOCRED;
			bp->b_caller1 = (void *)t_uio;
			bp->b_vp = vp;
			/* Credential already hung on b_wcred, so pass NOCRED here. */
			error = nfs_asyncio(nmp, bp, NOCRED, td);
			if (error) {
				/* Undo this chunk's allocations before falling back. */
				free(t_iov->iov_base, M_NFSDIRECTIO);
				free(t_iov, M_NFSDIRECTIO);
				free(t_uio, M_NFSDIRECTIO);
				bp->b_vp = NULL;
				relpbuf(bp, &nfs_pbuf_freecnt);
				if (error == EINTR)
					return (error);
				/* No iod available: finish the rest synchronously. */
				goto do_sync;
			}
			uiop->uio_offset += size;
			uiop->uio_resid -= size;
			if (uiop->uio_iov->iov_len <= size) {
				uiop->uio_iovcnt--;
				uiop->uio_iov++;
			} else {
				uiop->uio_iov->iov_base =
					(char *)uiop->uio_iov->iov_base + size;
				uiop->uio_iov->iov_len -= size;
			}
		}
	}
	return (0);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(struct vop_write_args *ap)
{
	int biosize;
	struct uio *uio = ap->a_uio;
	struct thread *td = uio->uio_td;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bcount;
	int n, on, error = 0;
	struct proc *p = td?td->td_proc:NULL;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	mtx_lock(&np->n_mtx);
	if (np->n_flag & NWRITEERR) {
		/* Report (and consume) a previously recorded async write error. */
		np->n_flag &= ~NWRITEERR;
		mtx_unlock(&np->n_mtx);
		return (np->n_error);
	} else
		mtx_unlock(&np->n_mtx);
	mtx_lock(&nmp->nm_mtx);
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
		/* Fetch v3 FSINFO (wsize etc.) once, if we don't have it yet. */
		mtx_unlock(&nmp->nm_mtx);
		(void)nfs_fsinfo(nmp, vp, cred, td);
	} else
		mtx_unlock(&nmp->nm_mtx);

	/*
	 * Synchronously flush pending buffers if we are in synchronous
	 * mode or if we are appending.
	 */
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		mtx_lock(&np->n_mtx);
		if (np->n_flag & NMODIFIED) {
			mtx_unlock(&np->n_mtx);
#ifdef notyet /* Needs matching nonblock semantics elsewhere, too. */
			/*
			 * Require non-blocking, synchronous writes to
			 * dirty files to inform the program it needs
			 * to fsync(2) explicitly.
			 */
			if (ioflag & IO_NDELAY)
				return (EAGAIN);
#endif
flush_and_restart:
			/* Attribute cache is stale after a flush. */
			np->n_attrstamp = 0;
			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
			if (error)
				return (error);
		} else
			mtx_unlock(&np->n_mtx);
	}

	/*
	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
	 * get the append lock.
	 */
	if (ioflag & IO_APPEND) {
		np->n_attrstamp = 0;
		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
		/* Refresh attributes so n_size is current before appending. */
		error = VOP_GETATTR(vp, &vattr, cred);
		if (error)
			return (error);
		mtx_lock(&np->n_mtx);
		uio->uio_offset = np->n_size;
		mtx_unlock(&np->n_mtx);
	}

	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);

	if (nfs_directio_enable && (ioflag & IO_DIRECT) && vp->v_type == VREG)
		return nfs_directio_write(vp, uio, cred, ioflag);

	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, i don't think it matters
	 */
	if (p != NULL) {
		PROC_LOCK(p);
		if (uio->uio_offset + uio->uio_resid >
		    lim_cur(p, RLIMIT_FSIZE)) {
			psignal(p, SIGXFSZ);
			PROC_UNLOCK(p);
			return (EFBIG);
		}
		PROC_UNLOCK(p);
	}

	biosize = vp->v_mount->mnt_stat.f_iosize;
	/*
	 * Find all of this file's B_NEEDCOMMIT buffers.  If our writes
	 * would exceed the local maximum per-file write commit size when
	 * combined with those, we must decide whether to flush,
	 * go synchronous, or return error.  We don't bother checking
	 * IO_UNIT -- we just make all writes atomic anyway, as there's
	 * no point optimizing for something that really won't ever happen.
	 */
	if (!(ioflag & IO_SYNC)) {
		int nflag;

		mtx_lock(&np->n_mtx);
		nflag = np->n_flag;
		mtx_unlock(&np->n_mtx);
		int needrestart = 0;
		if (nmp->nm_wcommitsize < uio->uio_resid) {
			/*
			 * If this request could not possibly be completed
			 * without exceeding the maximum outstanding write
			 * commit size, see if we can convert it into a
			 * synchronous write operation.
			 */
			if (ioflag & IO_NDELAY)
				return (EAGAIN);
			ioflag |= IO_SYNC;
			if (nflag & NMODIFIED)
				needrestart = 1;
		} else if (nflag & NMODIFIED) {
			int wouldcommit = 0;
			BO_LOCK(&vp->v_bufobj);
			if (vp->v_bufobj.bo_dirty.bv_cnt != 0) {
				TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd,
				    b_bobufs) {
					if (bp->b_flags & B_NEEDCOMMIT)
						wouldcommit += bp->b_bcount;
				}
			}
			BO_UNLOCK(&vp->v_bufobj);
			/*
			 * Since we're not operating synchronously and
			 * bypassing the buffer cache, we are in a commit
			 * and holding all of these buffers whether
			 * transmitted or not.  If not limited, this
			 * will lead to the buffer cache deadlocking,
			 * as no one else can flush our uncommitted buffers.
			 */
			wouldcommit += uio->uio_resid;
			/*
			 * If we would initially exceed the maximum
			 * outstanding write commit size, flush and restart.
			 */
			if (wouldcommit > nmp->nm_wcommitsize)
				needrestart = 1;
		}
		if (needrestart)
			goto flush_and_restart;
	}

	do {
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		/*
		 * NOTE(review): the mask assumes f_iosize is a power of 2 --
		 * presumably guaranteed by the mount code; verify.
		 */
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		/*
		 * Handle direct append and file extension cases, calculate
		 * unaligned buffer size.
		 */
		mtx_lock(&np->n_mtx);
		if (uio->uio_offset == np->n_size && n) {
			mtx_unlock(&np->n_mtx);
			/*
			 * Get the buffer (in its pre-append state to maintain
			 * B_CACHE if it was previously set).  Resize the
			 * nfsnode after we have locked the buffer to prevent
			 * readers from reading garbage.
			 */
			bcount = on;
			bp = nfs_getcacheblk(vp, lbn, bcount, td);

			if (bp != NULL) {
				long save;

				mtx_lock(&np->n_mtx);
				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);
				mtx_unlock(&np->n_mtx);

				/* Grow the buffer, preserving its B_CACHE state. */
				save = bp->b_flags & B_CACHE;
				bcount += n;
				allocbuf(bp, bcount);
				bp->b_flags |= save;
			}
		} else {
			/*
			 * Obtain the locked cache block first, and then
			 * adjust the file's size as appropriate.
			 */
			bcount = on + n;
			if ((off_t)lbn * biosize + bcount < np->n_size) {
				if ((off_t)(lbn + 1) * biosize < np->n_size)
					bcount = biosize;
				else
					bcount = np->n_size - (off_t)lbn * biosize;
			}
			mtx_unlock(&np->n_mtx);
			bp = nfs_getcacheblk(vp, lbn, bcount, td);
			mtx_lock(&np->n_mtx);
			if (uio->uio_offset + n > np->n_size) {
				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);
			}
			mtx_unlock(&np->n_mtx);
		}

		if (!bp) {
			/* getcacheblk failed: interrupted, or plain EINTR. */
			error = nfs_sigintr(nmp, NULL, td);
			if (!error)
				error = EINTR;
			break;
		}

		/*
		 * Issue a READ if B_CACHE is not set.  In special-append
		 * mode, B_CACHE is based on the buffer prior to the write
		 * op and is typically set, avoiding the read.  If a read
		 * is required in special append mode, the server will
		 * probably send us a short-read since we extended the file
		 * on our end, resulting in b_resid == 0 and, thusly,
		 * B_CACHE getting set.
		 *
		 * We can also avoid issuing the read if the write covers
		 * the entire buffer.  We have to make sure the buffer state
		 * is reasonable in this case since we will not be initiating
		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
		 * more information.
		 *
		 * B_CACHE may also be set due to the buffer being cached
		 * normally.
		 */

		if (on == 0 && n == bcount) {
			bp->b_flags |= B_CACHE;
			bp->b_flags &= ~B_INVAL;
			bp->b_ioflags &= ~BIO_ERROR;
		}

		if ((bp->b_flags & B_CACHE) == 0) {
			/* Read-before-write to fill the parts we won't overwrite. */
			bp->b_iocmd = BIO_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(vp, bp, cred, td);
			if (error) {
				brelse(bp);
				break;
			}
		}
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crhold(cred);
		mtx_lock(&np->n_mtx);
		np->n_flag |= NMODIFIED;
		mtx_unlock(&np->n_mtx);

		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not normally occur but there is an append race where it
		 * might occur XXX, so we log it.
		 *
		 * If the chopping creates a reverse-indexed or degenerate
		 * situation with dirtyoff/end, we 0 both of them.
		 */

		if (bp->b_dirtyend > bcount) {
			nfs_printf("NFS append race @%lx:%d\n",
			    (long)bp->b_blkno * DEV_BSIZE,
			    bp->b_dirtyend - bcount);
			bp->b_dirtyend = bcount;
		}

		if (bp->b_dirtyoff >= bp->b_dirtyend)
			bp->b_dirtyoff = bp->b_dirtyend = 0;

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 *
		 * While it is possible to merge discontiguous writes due to
		 * our having a B_CACHE buffer ( and thus valid read data
		 * for the hole), we don't because it could lead to
		 * significant cache coherency problems with multiple clients,
		 * especially if locking is implemented later on.
		 *
		 * as an optimization we could theoretically maintain
		 * a linked list of discontinuous areas, but we would still
		 * have to commit them separately so there isn't much
		 * advantage to it except perhaps a bit of asynchronization.
		 */

		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			if (bwrite(bp) == EINTR) {
				error = EINTR;
				break;
			}
			goto again;
		}

		error = uiomove((char *)bp->b_data + on, n, uio);

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.  Since write clustering does
		 * not work for the stage 1 data write, only the stage 2
		 * commit rpc, we have to clear B_CLUSTEROK as well.
		 */
		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);

		if (error) {
			bp->b_ioflags |= BIO_ERROR;
			brelse(bp);
			break;
		}

		/*
		 * Only update dirtyoff/dirtyend if not a degenerate
		 * condition.
		 */
		if (n) {
			if (bp->b_dirtyend > 0) {
				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
			} else {
				bp->b_dirtyoff = on;
				bp->b_dirtyend = on + n;
			}
			vfs_bio_set_validclean(bp, on, n);
		}

		/*
		 * If IO_SYNC do bwrite().
		 *
		 * IO_INVAL appears to be unused.  The idea appears to be
		 * to turn off caching in this case.  Very odd.  XXX
		 */
		if ((ioflag & IO_SYNC)) {
			if (ioflag & IO_INVAL)
				bp->b_flags |= B_NOCACHE;
			error = bwrite(bp);
			if (error)
				break;
		} else if ((n + on) == biosize) {
			/* Full block: start the write RPC asynchronously now. */
			bp->b_flags |= B_ASYNC;
			(void) (nmp->nm_rpcops->nr_writebp)(bp, 0, NULL);
		} else {
			/* Partial block: just mark delayed-write. */
			bdwrite(bp);
		}
	} while (uio->uio_resid > 0 && n > 0);

	return (error);
}

/*
 * Get an nfs cache block.
 *
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 *
 * The caller must carefully deal with the possible B_INVAL state of
 * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
 * indirectly), so synchronous reads can be issued without worrying about
We have to be a little more careful when dealing
 * with writes (see comments in nfs_write()) when extending a file past
 * its EOF.
 */
static struct buf *
nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td)
{
	struct buf *bp;
	struct mount *mp;
	struct nfsmount *nmp;

	mp = vp->v_mount;
	nmp = VFSTONFS(mp);

	if (nmp->nm_flag & NFSMNT_INT) {
		sigset_t oldset;

		/*
		 * Interruptible mount: do the first getblk() catching
		 * signals; if interrupted (NULL), poll nfs_sigintr() and
		 * retry with a 2*hz timeout until we get a buffer or the
		 * request is aborted.
		 */
		nfs_set_sigmask(td, &oldset);
		bp = getblk(vp, bn, size, PCATCH, 0, 0);
		nfs_restore_sigmask(td, &oldset);
		while (bp == NULL) {
			if (nfs_sigintr(nmp, NULL, td))
				return (NULL);
			bp = getblk(vp, bn, size, 0, 2 * hz, 0);
		}
	} else {
		bp = getblk(vp, bn, size, 0, 0, 0);
	}

	if (vp->v_type == VREG) {
		int biosize;

		/* Map the logical block number to DEV_BSIZE units. */
		biosize = mp->mnt_stat.f_iosize;
		bp->b_blkno = bn * (biosize / DEV_BSIZE);
	}
	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(struct vnode *vp, int flags, struct thread *td, int intrflg)
{
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;
	int old_lock = 0;

	ASSERT_VOP_LOCKED(vp, "nfs_vinvalbuf");

	/*
	 * XXX This check stops us from needlessly doing a vinvalbuf when
	 * being called through vclean().  It is not clear that this is
	 * unsafe.
	 */
	if (vp->v_iflag & VI_DOOMED)
		return (0);

	/* Sleeps below are interruptible only on NFSMNT_INT mounts. */
	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}

	/* Take the vnode lock exclusively; restore the old mode on exit. */
	old_lock = nfs_upgrade_vnlock(vp);
	/*
	 * Now, flush as required.
	 */
	if ((flags & V_SAVE) && (vp->v_bufobj.bo_object != NULL)) {
		VM_OBJECT_LOCK(vp->v_bufobj.bo_object);
		vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC);
		VM_OBJECT_UNLOCK(vp->v_bufobj.bo_object);
		/*
		 * If the page clean was interrupted, fail the invalidation.
		 * Not doing so, we run the risk of losing dirty pages in the
		 * vinvalbuf() call below.
		 */
		if (intrflg && (error = nfs_sigintr(nmp, NULL, td)))
			goto out;
	}

	error = vinvalbuf(vp, flags, slpflag, 0);
	while (error) {
		if (intrflg && (error = nfs_sigintr(nmp, NULL, td)))
			goto out;
		/* Retry uninterruptibly, but with a timeout. */
		error = vinvalbuf(vp, flags, 0, slptimeo);
	}
	mtx_lock(&np->n_mtx);
	/* Keep NMODIFIED while O_DIRECT async writes are still in flight. */
	if (np->n_directio_asyncwr == 0)
		np->n_flag &= ~NMODIFIED;
	mtx_unlock(&np->n_mtx);
out:
	nfs_downgrade_vnlock(vp, old_lock);
	return error;
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 *
 * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
 * is eventually dequeued by the async daemon, nfs_doio() *will*.
 */
int
nfs_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thread *td)
{
	int iod;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error, error2;

	/*
	 * Commits are usually short and sweet so lets save some cpu and
	 * leave the async daemons for more important rpc's (such as reads
	 * and writes).
	 */
	mtx_lock(&nfs_iod_mtx);
	if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
	    (nmp->nm_bufqiods > nfs_numasync / 2)) {
		mtx_unlock(&nfs_iod_mtx);
		return(EIO);
	}
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (iod = 0; iod < nfs_numasync; iod++)
		if (nfs_iodwant[iod]) {
			gotiod = TRUE;
			break;
		}

	/*
	 * Try to create one if none are free.
	 */
	if (!gotiod) {
		iod = nfs_nfsiodnew();
		if (iod != -1)
			gotiod = TRUE;
	}

	if (gotiod) {
		/*
		 * Found one, so wake it up and tell it which
		 * mount to process.
		 */
		NFS_DPF(ASYNCIO, ("nfs_asyncio: waking iod %d for mount %p\n",
		    iod, nmp));
		nfs_iodwant[iod] = NULL;
		nfs_iodmount[iod] = nmp;
		nmp->nm_bufqiods++;
		wakeup(&nfs_iodwant[iod]);
	}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
		("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.  We still want
		 * to asynchronize so we block rather then return EIO.
		 */
		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
		("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = nfs_msleep(td, &nmp->nm_bufq, &nfs_iod_mtx,
					   slpflag | PRIBIO,
					   "nfsaio", slptimeo);
			if (error) {
				error2 = nfs_sigintr(nmp, NULL, td);
				if (error2) {
					mtx_unlock(&nfs_iod_mtx);
					return (error2);
				}
				/*
				 * Signal received but not fatal: stop
				 * catching signals and sleep with a timeout
				 * instead.
				 */
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if nescessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

		/* We might have lost our nfsiod */
		if (nmp->nm_bufqiods == 0) {
			NFS_DPF(ASYNCIO,
("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
			goto again;
		}

		/* Hang the caller's credential on the buffer if none yet. */
		if (bp->b_iocmd == BIO_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED)
				bp->b_rcred = crhold(cred);
		} else {
			if (bp->b_wcred == NOCRED && cred != NOCRED)
				bp->b_wcred = crhold(cred);
		}

		if (bp->b_flags & B_REMFREE)
			bremfreef(bp);
		BUF_KERNPROC(bp);
		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
			/* Account an in-flight O_DIRECT async write on the node. */
			mtx_lock(&(VTONFS(bp->b_vp))->n_mtx);
			VTONFS(bp->b_vp)->n_flag |= NMODIFIED;
			VTONFS(bp->b_vp)->n_directio_asyncwr++;
			mtx_unlock(&(VTONFS(bp->b_vp))->n_mtx);
		}
		mtx_unlock(&nfs_iod_mtx);
		return (0);
	}

	mtx_unlock(&nfs_iod_mtx);

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}

/*
 * nfsiod-side completion of an O_DIRECT write queued by
 * nfs_directio_write(): issue the FILESYNC write RPC from the uio hung on
 * b_caller1, free the private uio/iovec/data, drop the node's
 * n_directio_asyncwr count and wake any fsync waiter, then release the
 * pbuf.
 */
void
nfs_doio_directwrite(struct buf *bp)
{
	int iomode, must_commit;
	struct uio *uiop = (struct uio *)bp->b_caller1;
	char *iov_base = uiop->uio_iov->iov_base;
	struct nfsmount *nmp = VFSTONFS(bp->b_vp->v_mount);

	iomode = NFSV3WRITE_FILESYNC;
	uiop->uio_td = NULL; /* NULL since we're in nfsiod */
	(nmp->nm_rpcops->nr_writerpc)(bp->b_vp, uiop, bp->b_wcred, &iomode, &must_commit);
	/* FILESYNC writes must never come back needing a commit. */
	KASSERT((must_commit == 0), ("nfs_doio_directwrite: Did not commit write"));
	free(iov_base, M_NFSDIRECTIO);
	free(uiop->uio_iov, M_NFSDIRECTIO);
	free(uiop, M_NFSDIRECTIO);
	if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
		struct nfsnode *np = VTONFS(bp->b_vp);
		mtx_lock(&np->n_mtx);
		np->n_directio_asyncwr--;
		if (np->n_directio_asyncwr == 0) {
			/* Last outstanding direct write: node is clean again. */
			VTONFS(bp->b_vp)->n_flag &= ~NMODIFIED;
			if ((np->n_flag & NFSYNCWAIT)) {
				np->n_flag &= ~NFSYNCWAIT;
				wakeup((caddr_t)&np->n_directio_asyncwr);
			}
		}
		mtx_unlock(&np->n_mtx);
	}
	bp->b_vp = NULL;
	relpbuf(bp, &nfs_pbuf_freecnt);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
15321541Srgrimes */ 15331541Srgrimesint 1534134898Sphknfs_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td) 15351541Srgrimes{ 153644679Sjulian struct uio *uiop; 15371541Srgrimes struct nfsnode *np; 15381541Srgrimes struct nfsmount *nmp; 153946349Salc int error = 0, iomode, must_commit = 0; 15401541Srgrimes struct uio uio; 15411541Srgrimes struct iovec io; 154283651Speter struct proc *p = td ? td->td_proc : NULL; 1543158739Smohans uint8_t iocmd; 1544158739Smohans 15451541Srgrimes np = VTONFS(vp); 15461541Srgrimes nmp = VFSTONFS(vp->v_mount); 15471541Srgrimes uiop = &uio; 15481541Srgrimes uiop->uio_iov = &io; 15491541Srgrimes uiop->uio_iovcnt = 1; 15501541Srgrimes uiop->uio_segflg = UIO_SYSSPACE; 155183366Sjulian uiop->uio_td = td; 15521541Srgrimes 155346349Salc /* 155458934Sphk * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We 155546349Salc * do this here so we do not have to do it in all the code that 155646349Salc * calls us. 155746349Salc */ 155858934Sphk bp->b_flags &= ~B_INVAL; 155958934Sphk bp->b_ioflags &= ~BIO_ERROR; 156046349Salc 156144679Sjulian KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp)); 1562158739Smohans iocmd = bp->b_iocmd; 1563158739Smohans if (iocmd == BIO_READ) { 15643664Sphk io.iov_len = uiop->uio_resid = bp->b_bcount; 15653664Sphk io.iov_base = bp->b_data; 15661541Srgrimes uiop->uio_rw = UIO_READ; 156787834Sdillon 15681541Srgrimes switch (vp->v_type) { 15691541Srgrimes case VREG: 15709336Sdfr uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; 15711541Srgrimes nfsstats.read_bios++; 1572122953Salfred error = (nmp->nm_rpcops->nr_readrpc)(vp, uiop, cr); 157387834Sdillon 15741541Srgrimes if (!error) { 15751541Srgrimes if (uiop->uio_resid) { 15761541Srgrimes /* 157746349Salc * If we had a short read with no error, we must have 157846349Salc * hit a file hole. We should zero-fill the remainder. 157946349Salc * This can also occur if the server hits the file EOF. 
158046349Salc * 158183651Speter * Holes used to be able to occur due to pending 158246349Salc * writes, but that is not possible any longer. 15831541Srgrimes */ 158446349Salc int nread = bp->b_bcount - uiop->uio_resid; 158587834Sdillon int left = uiop->uio_resid; 158646349Salc 158746349Salc if (left > 0) 158846349Salc bzero((char *)bp->b_data + nread, left); 158946349Salc uiop->uio_resid = 0; 159046349Salc } 15911541Srgrimes } 1592115041Srwatson /* ASSERT_VOP_LOCKED(vp, "nfs_doio"); */ 1593158739Smohans if (p && (vp->v_vflag & VV_TEXT)) { 1594158739Smohans mtx_lock(&np->n_mtx); 1595158739Smohans if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.va_mtime)) { 1596158739Smohans mtx_unlock(&np->n_mtx); 1597158739Smohans PROC_LOCK(p); 1598158739Smohans killproc(p, "text file modification"); 1599158739Smohans PROC_UNLOCK(p); 1600158739Smohans } else 1601158739Smohans mtx_unlock(&np->n_mtx); 16021541Srgrimes } 16031541Srgrimes break; 16041541Srgrimes case VLNK: 16059336Sdfr uiop->uio_offset = (off_t)0; 16061541Srgrimes nfsstats.readlink_bios++; 1607122953Salfred error = (nmp->nm_rpcops->nr_readlinkrpc)(vp, uiop, cr); 16081541Srgrimes break; 16091541Srgrimes case VDIR: 16101541Srgrimes nfsstats.readdir_bios++; 16119336Sdfr uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ; 1612192578Srwatson if ((nmp->nm_flag & NFSMNT_RDIRPLUS) != 0) { 1613192578Srwatson error = nfs_readdirplusrpc(vp, uiop, cr); 1614192578Srwatson if (error == NFSERR_NOTSUPP) 1615192578Srwatson nmp->nm_flag &= ~NFSMNT_RDIRPLUS; 16169336Sdfr } 1617192578Srwatson if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) 1618192578Srwatson error = nfs_readdirrpc(vp, uiop, cr); 161946349Salc /* 162046349Salc * end-of-directory sets B_INVAL but does not generate an 162146349Salc * error. 
162246349Salc */ 162339782Smckusick if (error == 0 && uiop->uio_resid == bp->b_bcount) 162439782Smckusick bp->b_flags |= B_INVAL; 16251541Srgrimes break; 16263305Sphk default: 1627158739Smohans nfs_printf("nfs_doio: type %x unexpected\n", vp->v_type); 16283305Sphk break; 16291541Srgrimes }; 16301541Srgrimes if (error) { 163158934Sphk bp->b_ioflags |= BIO_ERROR; 16321541Srgrimes bp->b_error = error; 16331541Srgrimes } 16341541Srgrimes } else { 163583651Speter /* 163651344Sdillon * If we only need to commit, try to commit 163751344Sdillon */ 163851344Sdillon if (bp->b_flags & B_NEEDCOMMIT) { 163951344Sdillon int retv; 164051344Sdillon off_t off; 164151344Sdillon 164251344Sdillon off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; 1643122953Salfred retv = (nmp->nm_rpcops->nr_commit)( 1644136927Sphk vp, off, bp->b_dirtyend-bp->b_dirtyoff, 164583366Sjulian bp->b_wcred, td); 164651344Sdillon if (retv == 0) { 164751344Sdillon bp->b_dirtyoff = bp->b_dirtyend = 0; 164854480Sdillon bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 164951344Sdillon bp->b_resid = 0; 165059249Sphk bufdone(bp); 165151344Sdillon return (0); 165251344Sdillon } 165351344Sdillon if (retv == NFSERR_STALEWRITEVERF) { 1654136927Sphk nfs_clearcommit(vp->v_mount); 165551344Sdillon } 165651344Sdillon } 165751344Sdillon 165851344Sdillon /* 165951344Sdillon * Setup for actual write 166051344Sdillon */ 1661158739Smohans mtx_lock(&np->n_mtx); 166241791Sdt if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size) 166341791Sdt bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE; 1664158739Smohans mtx_unlock(&np->n_mtx); 16658692Sdg 16668692Sdg if (bp->b_dirtyend > bp->b_dirtyoff) { 16678692Sdg io.iov_len = uiop->uio_resid = bp->b_dirtyend 16689336Sdfr - bp->b_dirtyoff; 166941791Sdt uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE 16709336Sdfr + bp->b_dirtyoff; 16718692Sdg io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; 16728692Sdg uiop->uio_rw = UIO_WRITE; 16738692Sdg 
nfsstats.write_bios++; 167444679Sjulian 167525785Sdfr if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC) 16769336Sdfr iomode = NFSV3WRITE_UNSTABLE; 16778692Sdg else 16789336Sdfr iomode = NFSV3WRITE_FILESYNC; 167944679Sjulian 1680122953Salfred error = (nmp->nm_rpcops->nr_writerpc)(vp, uiop, cr, &iomode, &must_commit); 168151475Sdillon 168251475Sdillon /* 168351475Sdillon * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try 168451475Sdillon * to cluster the buffers needing commit. This will allow 168551475Sdillon * the system to submit a single commit rpc for the whole 168683651Speter * cluster. We can do this even if the buffer is not 100% 168754480Sdillon * dirty (relative to the NFS blocksize), so we optimize the 168854480Sdillon * append-to-file-case. 168954480Sdillon * 169054480Sdillon * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be 169154480Sdillon * cleared because write clustering only works for commit 169254480Sdillon * rpc's, not for the data portion of the write). 169351475Sdillon */ 169451475Sdillon 169525003Sdfr if (!error && iomode == NFSV3WRITE_UNSTABLE) { 169625003Sdfr bp->b_flags |= B_NEEDCOMMIT; 169725003Sdfr if (bp->b_dirtyoff == 0 169846349Salc && bp->b_dirtyend == bp->b_bcount) 169925003Sdfr bp->b_flags |= B_CLUSTEROK; 170044679Sjulian } else { 170154480Sdillon bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 170244679Sjulian } 17038692Sdg 17049336Sdfr /* 17059336Sdfr * For an interrupted write, the buffer is still valid 17069336Sdfr * and the write hasn't been pushed to the server yet, 170758934Sphk * so we can't set BIO_ERROR and report the interruption 17089336Sdfr * by setting B_EINTR. For the B_ASYNC case, B_EINTR 17099336Sdfr * is not relevant, so the rpc attempt is essentially 17109336Sdfr * a noop. 
For the case of a V3 write rpc not being 17119336Sdfr * committed to stable storage, the block is still 17129336Sdfr * dirty and requires either a commit rpc or another 17139336Sdfr * write rpc with iomode == NFSV3WRITE_FILESYNC before 17149336Sdfr * the block is reused. This is indicated by setting 17159336Sdfr * the B_DELWRI and B_NEEDCOMMIT flags. 171642957Sdillon * 171742957Sdillon * If the buffer is marked B_PAGING, it does not reside on 171844679Sjulian * the vp's paging queues so we cannot call bdirty(). The 171944679Sjulian * bp in this case is not an NFS cache block so we should 172044679Sjulian * be safe. XXX 1721171189Sjhb * 1722171189Sjhb * The logic below breaks up errors into recoverable and 1723171189Sjhb * unrecoverable. For the former, we clear B_INVAL|B_NOCACHE 1724171189Sjhb * and keep the buffer around for potential write retries. 1725171189Sjhb * For the latter (eg ESTALE), we toss the buffer away (B_INVAL) 1726171189Sjhb * and save the error in the nfsnode. This is less than ideal 1727171189Sjhb * but necessary. Keeping such buffers around could potentially 1728171189Sjhb * cause buffer exhaustion eventually (they can never be written 1729171189Sjhb * out, so will get constantly be re-dirtied). It also causes 1730171189Sjhb * all sorts of vfs panics. For non-recoverable write errors, 1731171189Sjhb * also invalidate the attrcache, so we'll be forced to go over 1732171189Sjhb * the wire for this object, returning an error to user on next 1733171189Sjhb * call (most of the time). 
17349336Sdfr */ 1735152656Sps if (error == EINTR || error == EIO || error == ETIMEDOUT 17369336Sdfr || (!error && (bp->b_flags & B_NEEDCOMMIT))) { 173734266Sjulian int s; 173834266Sjulian 173944679Sjulian s = splbio(); 17408692Sdg bp->b_flags &= ~(B_INVAL|B_NOCACHE); 174142957Sdillon if ((bp->b_flags & B_PAGING) == 0) { 174244679Sjulian bdirty(bp); 174344679Sjulian bp->b_flags &= ~B_DONE; 174442957Sdillon } 174547749Speter if (error && (bp->b_flags & B_ASYNC) == 0) 174632755Sdyson bp->b_flags |= B_EINTR; 174744679Sjulian splx(s); 17488692Sdg } else { 174944679Sjulian if (error) { 175058934Sphk bp->b_ioflags |= BIO_ERROR; 1751171189Sjhb bp->b_flags |= B_INVAL; 175244679Sjulian bp->b_error = np->n_error = error; 1753158739Smohans mtx_lock(&np->n_mtx); 175444679Sjulian np->n_flag |= NWRITEERR; 1755171189Sjhb np->n_attrstamp = 0; 1756190380Srwatson KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); 1757158739Smohans mtx_unlock(&np->n_mtx); 175844679Sjulian } 175944679Sjulian bp->b_dirtyoff = bp->b_dirtyend = 0; 17608692Sdg } 17611541Srgrimes } else { 17628692Sdg bp->b_resid = 0; 176359249Sphk bufdone(bp); 17648692Sdg return (0); 17651541Srgrimes } 17661541Srgrimes } 17671541Srgrimes bp->b_resid = uiop->uio_resid; 17689336Sdfr if (must_commit) 176944679Sjulian nfs_clearcommit(vp->v_mount); 177059249Sphk bufdone(bp); 17711541Srgrimes return (error); 17721541Srgrimes} 177387834Sdillon 177487834Sdillon/* 177587834Sdillon * Used to aid in handling ftruncate() operations on the NFS client side. 177687834Sdillon * Truncation creates a number of special problems for NFS. We have to 177787834Sdillon * throw away VM pages and buffer cache buffers that are beyond EOF, and 177887834Sdillon * we have to properly handle VM pages or (potentially dirty) buffers 177987834Sdillon * that straddle the truncation point. 
178087834Sdillon */ 178187834Sdillon 178287834Sdillonint 178387834Sdillonnfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize) 178487834Sdillon{ 178587834Sdillon struct nfsnode *np = VTONFS(vp); 1786158739Smohans u_quad_t tsize; 178787834Sdillon int biosize = vp->v_mount->mnt_stat.f_iosize; 178887834Sdillon int error = 0; 178987834Sdillon 1790158739Smohans mtx_lock(&np->n_mtx); 1791158739Smohans tsize = np->n_size; 179287834Sdillon np->n_size = nsize; 1793158739Smohans mtx_unlock(&np->n_mtx); 179487834Sdillon 1795158739Smohans if (nsize < tsize) { 179687834Sdillon struct buf *bp; 179787834Sdillon daddr_t lbn; 179887834Sdillon int bufsize; 179987834Sdillon 180087834Sdillon /* 180187834Sdillon * vtruncbuf() doesn't get the buffer overlapping the 180287834Sdillon * truncation point. We may have a B_DELWRI and/or B_CACHE 180387834Sdillon * buffer that now needs to be truncated. 180487834Sdillon */ 180587834Sdillon error = vtruncbuf(vp, cred, td, nsize, biosize); 180687834Sdillon lbn = nsize / biosize; 180787834Sdillon bufsize = nsize & (biosize - 1); 180887834Sdillon bp = nfs_getcacheblk(vp, lbn, bufsize, td); 1809138496Sps if (!bp) 1810138496Sps return EINTR; 181187834Sdillon if (bp->b_dirtyoff > bp->b_bcount) 181287834Sdillon bp->b_dirtyoff = bp->b_bcount; 181387834Sdillon if (bp->b_dirtyend > bp->b_bcount) 181487834Sdillon bp->b_dirtyend = bp->b_bcount; 181587834Sdillon bp->b_flags |= B_RELBUF; /* don't leave garbage around */ 181687834Sdillon brelse(bp); 181787834Sdillon } else { 181887834Sdillon vnode_pager_setsize(vp, nsize); 181987834Sdillon } 182087834Sdillon return(error); 182187834Sdillon} 182287834Sdillon 1823