/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 * $FreeBSD: head/sys/kern/vfs_cluster.c 60041 2000-05-05 09:59:14Z phk $
 */
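
/*
 * Clustered buffer I/O: cluster_read() and cluster_rbuild() coalesce
 * sequential reads into one large transfer plus read-ahead,
 * cluster_write() and cluster_wbuild() gather sequential delayed writes
 * and push them out as a single cluster, and cluster_callback()
 * propagates completion status from a cluster back to its component
 * buffers.
 */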

#include "opt_debug_cluster.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <sys/sysctl.h>

#if defined(CLUSTERDEBUG)
#include <sys/sysctl.h>
static int rcluster = 0;
SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
#endif

static MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer");

static struct cluster_save *
	cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp));
static struct buf *
	cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn,
			    daddr_t blkno, long size, int run, struct buf *fbp));

static int write_behind = 1;
SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, "");

extern vm_page_t bogus_page;

extern int cluster_pbuf_freecnt;

/*
 * Maximum number of blocks for read-ahead.
 */
#define MAXRA 32

/*
 * This replaces bread.
 */
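/*
 * "totread" is the total number of bytes the caller expects to transfer
 * and "seqcount" is the caller's estimate of how sequential the access
 * pattern has been; both are used only to size the read-ahead window.
 * The buffer for the requested block is returned through *bpp.
 */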
int
cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	long totread;
	int seqcount;
	struct buf **bpp;
{
	struct buf *bp, *rbp, *reqbp;
	daddr_t blkno, origblkno;
	int error, num_ra;
	int i;
	int maxra, racluster;
	long origtotread;

	error = 0;

	/*
	 * Try to limit the amount of read-ahead by a few
	 * ad-hoc parameters.  This needs work!!!
	 */
	racluster = vp->v_mount->mnt_iosize_max / size;
	maxra = 2 * racluster + (totread / size);
	if (maxra > MAXRA)
		maxra = MAXRA;
	if (maxra > nbuf/8)
		maxra = nbuf/8;

	/*
	 * get the requested block
	 */
	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0);
	origblkno = lblkno;
	origtotread = totread;

	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		if (!seqcount) {
			return 0;
		} else if ((bp->b_flags & B_RAM) == 0) {
			return 0;
		} else {
			int s;
			struct buf *tbp;
			bp->b_flags &= ~B_RAM;
			/*
			 * We do the spl here so that there is no window
			 * between the incore and the b_usecount increment
			 * below.  We opt to keep the spl out of the loop
			 * for efficiency.
			 */
			s = splbio();
			for (i = 1; i < maxra; i++) {

				if (!(tbp = incore(vp, lblkno+i))) {
					break;
				}

				/*
				 * Set another read-ahead mark so we know
				 * to check again.
				 */
				if (((i % racluster) == (racluster - 1)) ||
					(i == (maxra - 1)))
					tbp->b_flags |= B_RAM;
			}
			splx(s);
			if (i >= maxra) {
				return 0;
			}
			lblkno += i;
		}
		reqbp = bp = NULL;
	} else {
		off_t firstread = bp->b_offset;

		KASSERT(bp->b_offset != NOOFFSET,
		    ("cluster_read: no buffer offset"));
		if (firstread + totread > filesize)
			totread = filesize - firstread;
		if (totread > size) {
			int nblks = 0;
			int ncontigafter;
			while (totread > 0) {
				nblks++;
				totread -= size;
			}
			if (nblks == 1)
				goto single_block_read;
			if (nblks > racluster)
				nblks = racluster;

			error = VOP_BMAP(vp, lblkno, NULL,
				&blkno, &ncontigafter, NULL);
			if (error)
				goto single_block_read;
			if (blkno == -1)
				goto single_block_read;
			if (ncontigafter == 0)
				goto single_block_read;
			if (ncontigafter + 1 < nblks)
				nblks = ncontigafter + 1;

			bp = cluster_rbuild(vp, filesize, lblkno,
				blkno, size, nblks, bp);
			lblkno += (bp->b_bufsize / size);
		} else {
single_block_read:
			/*
			 * if it isn't in the cache, then get a chunk from
			 * disk if sequential, otherwise just get the block.
			 */
			bp->b_flags |= B_RAM;
			bp->b_iocmd = BIO_READ;
			lblkno += 1;
		}
	}

	/*
	 * if we have been doing sequential I/O, then do some read-ahead
	 */
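	/*
	 * The read-ahead buffer "rbp" is only built while the next logical
	 * block still falls inside the window the seqcount heuristic
	 * allows (origblkno + seqcount blocks).
	 */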
	rbp = NULL;
	if (seqcount && (lblkno < (origblkno + seqcount))) {
		/*
		 * we now build the read-ahead buffer if it is desirable.
		 */
		if (((u_quad_t)(lblkno + 1) * size) <= filesize &&
		    !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) &&
		    blkno != -1) {
			int nblksread;
			int ntoread = num_ra + 1;
			nblksread = (origtotread + size - 1) / size;
			if (seqcount < nblksread)
				seqcount = nblksread;
			if (seqcount < ntoread)
				ntoread = seqcount;
			if (num_ra) {
				rbp = cluster_rbuild(vp, filesize, lblkno,
					blkno, size, ntoread, NULL);
			} else {
				rbp = getblk(vp, lblkno, size, 0, 0);
				rbp->b_flags |= B_ASYNC | B_RAM;
				rbp->b_iocmd = BIO_READ;
				rbp->b_blkno = blkno;
			}
		}
	}

	/*
	 * handle the synchronous read
	 */
	if (bp) {
#if defined(CLUSTERDEBUG)
		if (rcluster)
			printf("S(%ld,%ld,%d) ",
			    (long)bp->b_lblkno, bp->b_bcount, seqcount);
#endif
		if ((bp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(bp, 0);
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL)
			BUF_KERNPROC(bp);
		error = VOP_STRATEGY(vp, bp);
		curproc->p_stats->p_ru.ru_inblock++;
	}

	/*
	 * and if we have read-aheads, do them too
	 */
	if (rbp) {
		if (error) {
			rbp->b_flags &= ~B_ASYNC;
			brelse(rbp);
		} else if (rbp->b_flags & B_CACHE) {
			rbp->b_flags &= ~B_ASYNC;
			bqrelse(rbp);
		} else {
#if defined(CLUSTERDEBUG)
			if (rcluster) {
				if (bp)
					printf("A+(%ld,%ld,%ld,%d) ",
					    (long)rbp->b_lblkno, rbp->b_bcount,
					    (long)(rbp->b_lblkno - origblkno),
					    seqcount);
				else
					printf("A(%ld,%ld,%ld,%d) ",
					    (long)rbp->b_lblkno, rbp->b_bcount,
					    (long)(rbp->b_lblkno - origblkno),
					    seqcount);
			}
#endif

			if ((rbp->b_flags & B_CLUSTER) == 0)
				vfs_busy_pages(rbp, 0);
			rbp->b_flags &= ~B_INVAL;
			rbp->b_ioflags &= ~BIO_ERROR;
			if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL)
				BUF_KERNPROC(rbp);
			(void) VOP_STRATEGY(vp, rbp);
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	if (reqbp)
		return (bufwait(reqbp));
	else
		return (error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
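/*
 * "run" is the number of contiguous blocks to read, counting lbn itself;
 * "fbp", if non-NULL, is a buffer for lbn that the caller has already
 * obtained.  When clustering is not worthwhile (short run, non-VMIO
 * buffer, no free pbuf) the plain buffer for lbn is returned instead.
 */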
static struct buf *
cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	struct buf *fbp;
{
	struct buf *bp, *tbp;
	daddr_t bn;
	int i, inc, j;

	KASSERT(size == vp->v_mount->mnt_stat.f_iosize,
	    ("cluster_rbuild: size %ld != f_iosize %ld\n",
	    size, vp->v_mount->mnt_stat.f_iosize));

	/*
	 * avoid a division
	 */
	while ((u_quad_t) size * (lbn + run) > filesize) {
		--run;
	}

	if (fbp) {
		tbp = fbp;
		tbp->b_iocmd = BIO_READ;
	} else {
		tbp = getblk(vp, lbn, size, 0, 0);
		if (tbp->b_flags & B_CACHE)
			return tbp;
		tbp->b_flags |= B_ASYNC | B_RAM;
		tbp->b_iocmd = BIO_READ;
	}

	tbp->b_blkno = blkno;
	if ((tbp->b_flags & B_MALLOC) ||
	    ((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
		return tbp;

	bp = trypbuf(&cluster_pbuf_freecnt);
	if (bp == 0)
		return tbp;

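	/*
	 * Give the pbuf's data pointer the same intra-page offset as the
	 * first component buffer, so that the pages entered by
	 * pmap_qenter() below line up with b_data.
	 */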
	bp->b_data = (char *)((vm_offset_t)bp->b_data |
	    ((vm_offset_t)tbp->b_data & PAGE_MASK));
	bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO;
	bp->b_iocmd = BIO_READ;
	bp->b_iodone = cluster_callback;
	bp->b_blkno = blkno;
	bp->b_lblkno = lbn;
	bp->b_offset = tbp->b_offset;
	KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset"));
	pbgetvp(vp, bp);

	TAILQ_INIT(&bp->b_cluster.cluster_head);

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	inc = btodb(size);
	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
		if (i != 0) {
			if ((bp->b_npages * PAGE_SIZE) +
				round_page(size) > vp->v_mount->mnt_iosize_max)
				break;

			if ((tbp = incore(vp, lbn + i)) != NULL) {
				if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT))
					break;
				BUF_UNLOCK(tbp);

				for (j = 0; j < tbp->b_npages; j++)
					if (tbp->b_pages[j]->valid)
						break;

				if (j != tbp->b_npages)
					break;

				if (tbp->b_bcount != size)
					break;
			}

			tbp = getblk(vp, lbn + i, size, 0, 0);

			if ((tbp->b_flags & B_CACHE) ||
				(tbp->b_flags & B_VMIO) == 0) {
				bqrelse(tbp);
				break;
			}

			for (j = 0; j < tbp->b_npages; j++)
				if (tbp->b_pages[j]->valid)
					break;

			if (j != tbp->b_npages) {
				bqrelse(tbp);
				break;
			}

			if ((fbp && (i == 1)) || (i == (run - 1)))
				tbp->b_flags |= B_RAM;
			tbp->b_flags |= B_ASYNC;
			tbp->b_iocmd = BIO_READ;
			if (tbp->b_blkno == tbp->b_lblkno) {
				tbp->b_blkno = bn;
			} else if (tbp->b_blkno != bn) {
				brelse(tbp);
				break;
			}
		}
		/*
		 * XXX fbp from caller may not be B_ASYNC, but we are going
		 * to biodone() it in cluster_callback() anyway
		 */
		BUF_KERNPROC(tbp);
		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
			tbp, b_cluster.cluster_entry);
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;
			m = tbp->b_pages[j];
			vm_page_io_start(m);
			vm_object_pip_add(m->object, 1);
			if ((bp->b_npages == 0) ||
				(bp->b_pages[bp->b_npages-1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
				tbp->b_pages[j] = bogus_page;
		}
		bp->b_bcount += tbp->b_bcount;
		bp->b_bufsize += tbp->b_bufsize;
	}

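	/*
	 * Fully valid pages must not be overwritten by the incoming
	 * transfer, so point their slots at bogus_page; the device ends up
	 * scribbling on bogus_page while the real pages keep their current
	 * contents.
	 */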
	for (j = 0; j < bp->b_npages; j++) {
		if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) ==
		    VM_PAGE_BITS_ALL)
			bp->b_pages[j] = bogus_page;
	}
	if (bp->b_bufsize > bp->b_kvasize)
		panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
		    bp->b_bufsize, bp->b_kvasize);
	bp->b_kvasize = bp->b_bufsize;

	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
		(vm_page_t *)bp->b_pages, bp->b_npages);
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct buf *nbp, *tbp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_ioflags & BIO_ERROR)
		error = bp->b_error;

	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
		tbp; tbp = nbp) {
		nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
		if (error) {
			tbp->b_ioflags |= BIO_ERROR;
			tbp->b_error = error;
		} else {
			tbp->b_dirtyoff = tbp->b_dirtyend = 0;
			tbp->b_flags &= ~B_INVAL;
			tbp->b_ioflags &= ~BIO_ERROR;
		}
		bufdone(tbp);
	}
	relpbuf(bp, &cluster_pbuf_freecnt);
}

/*
 *	cluster_wbuild_wb:
 *
 *	Implement modified write build for cluster.
 *
 *	write_behind = 0	write behind disabled
 *	write_behind = 1	write behind normal (default)
 *	write_behind = 2	write behind backed-off
 */

static __inline int
cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len)
{
	int r = 0;

	switch (write_behind) {
	case 2:
		if (start_lbn < len)
			break;
		start_lbn -= len;
		/* fall through */
	case 1:
		r = cluster_wbuild(vp, size, start_lbn, len);
		/* fall through */
	default:
		/* fall through */
		break;
	}
	return(r);
}

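/*
 * cluster_write() is called by filesystems (e.g. FFS) for each logically
 * sequential full-block write; it decides whether to start, extend, or
 * flush the vnode's current write cluster, tracked in v_cstart, v_lastw,
 * v_clen and v_lasta.
 */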
/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize, seqcount)
	struct buf *bp;
	u_quad_t filesize;
	int seqcount;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;
	int lblocksize;
	int async;

	vp = bp->b_vp;
	if (vp->v_type == VREG) {
		async = vp->v_mount->mnt_flag & MNT_ASYNC;
		lblocksize = vp->v_mount->mnt_stat.f_iosize;
	} else {
		async = 0;
		lblocksize = bp->b_bufsize;
	}
	lbn = bp->b_lblkno;
	KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset"));

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
		maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster.  Otherwise try
			 * reallocating to make it sequential.
			 *
			 * Change to algorithm: only push previous cluster if
			 * it was sequential from the point of view of the
			 * seqcount heuristic, otherwise leave the buffer
			 * intact so we can potentially optimize the I/O
			 * later on in the buf_daemon or update daemon
			 * flush.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
			if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				if (!async && seqcount > 0) {
					cluster_wbuild_wb(vp, lblocksize,
						vp->v_cstart, cursize);
				}
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster
					 * if *really* writing sequentially
					 * in the logical file (seqcount > 1),
					 * otherwise delay it in the hopes that
					 * the low level disk driver can
					 * optimize the write ordering.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					if (seqcount > 1) {
						cluster_wbuild_wb(vp,
						    lblocksize, vp->v_cstart,
						    cursize);
					}
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster. If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if ((vp->v_type == VREG) &&
		    ((u_quad_t) bp->b_offset + lblocksize) != filesize &&
		    (bp->b_blkno == bp->b_lblkno) &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
		     bp->b_blkno == -1)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (!async && maxclen == 0) {	/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {	/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out if seqcount tells us we
		 * are operating sequentially, otherwise let the buf or
		 * update daemon handle it.
		 */
		bdwrite(bp);
		if (seqcount > 1)
			cluster_wbuild_wb(vp, lblocksize, vp->v_cstart,
			    vp->v_clen + 1);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else {
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	}
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}

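/*
 * cluster_wbuild() scans the range [start_lbn, start_lbn + len) for
 * dirty, clusterable (B_DELWRI | B_CLUSTEROK) buffers of the given size,
 * gathers each run of them into a borrowed pbuf and issues the run as one
 * write.  It returns the number of bytes of write I/O actually started.
 */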
/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 */
int
cluster_wbuild(vp, size, start_lbn, len)
	struct vnode *vp;
	long size;
	daddr_t start_lbn;
	int len;
{
	struct buf *bp, *tbp;
	int i, j, s;
	int totalwritten = 0;
	int dbsize = btodb(size);

	while (len > 0) {
		s = splbio();
		if (((tbp = gbincore(vp, start_lbn)) == NULL) ||
		  ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) ||
		  BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) {
			++start_lbn;
			--len;
			splx(s);
			continue;
		}
		bremfree(tbp);
		tbp->b_flags &= ~B_DONE;
		splx(s);

		/*
		 * Extra memory in the buffer, punt on this buffer.
		 * XXX we could handle this in most cases, but we would
		 * have to push the extra memory down to after our max
		 * possible cluster size and then potentially pull it back
		 * up if the cluster was terminated prematurely--too much
		 * hassle.
		 */
		if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
		  (tbp->b_bcount != tbp->b_bufsize) ||
		  (tbp->b_bcount != size) ||
		  (len == 1) ||
		  ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			++start_lbn;
			--len;
			continue;
		}

		/*
		 * We got a pbuf to make the cluster in, so initialise it.
		 */
		TAILQ_INIT(&bp->b_cluster.cluster_head);
		bp->b_bcount = 0;
		bp->b_bufsize = 0;
		bp->b_npages = 0;
		if (tbp->b_wcred != NOCRED) {
			bp->b_wcred = tbp->b_wcred;
			crhold(bp->b_wcred);
		}

		bp->b_blkno = tbp->b_blkno;
		bp->b_lblkno = tbp->b_lblkno;
		bp->b_offset = tbp->b_offset;
		bp->b_data = (char *)((vm_offset_t)bp->b_data |
		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
		bp->b_flags |= B_CLUSTER |
				(tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
		bp->b_iodone = cluster_callback;
		pbgetvp(vp, bp);
		/*
		 * From this location in the file, scan forward to see
		 * if there are buffers with adjacent data that need to
		 * be written as well.
		 */
		for (i = 0; i < len; ++i, ++start_lbn) {
			if (i != 0) { /* If not the first buffer */
				s = splbio();
				/*
				 * If the adjacent data is not even in core it
				 * can't need to be written.
				 */
				if ((tbp = gbincore(vp, start_lbn)) == NULL) {
					splx(s);
					break;
				}

				/*
				 * If it IS in core, but has different
				 * characteristics, don't cluster with it.
				 */
				if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
				    B_INVAL | B_DELWRI | B_NEEDCOMMIT))
				  != (B_DELWRI | B_CLUSTEROK |
				    (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
				  tbp->b_wcred != bp->b_wcred ||
				  BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) {
					splx(s);
					break;
				}

				/*
				 * Check that the combined cluster
				 * would make sense with regard to pages
				 * and would not be too large
				 */
				if ((tbp->b_bcount != size) ||
				  ((bp->b_blkno + (dbsize * i)) !=
				    tbp->b_blkno) ||
				  ((tbp->b_npages + bp->b_npages) >
				    (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) {
					BUF_UNLOCK(tbp);
					splx(s);
					break;
				}
				/*
				 * Ok, it's passed all the tests,
				 * so remove it from the free list
				 * and mark it busy. We will use it.
				 */
				bremfree(tbp);
				tbp->b_flags &= ~B_DONE;
				splx(s);
			} /* end of code for non-first buffers only */
			/* check for latent dependencies to be handled */
			if ((LIST_FIRST(&tbp->b_dep)) != NULL &&
			    bioops.io_start)
				(*bioops.io_start)(tbp);
			/*
			 * If the IO is via the VM then we do some
			 * special VM hackery. (yuck)
			 */
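			/*
			 * In the VM-backed case below, a busy page in a
			 * candidate buffer means some other I/O still owns
			 * it; clustering stops there and whatever has been
			 * gathered so far is issued (goto finishcluster).
			 */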
			if (tbp->b_flags & B_VMIO) {
				vm_page_t m;

				if (i != 0) { /* if not first buffer */
					for (j = 0; j < tbp->b_npages; j += 1) {
						m = tbp->b_pages[j];
						if (m->flags & PG_BUSY) {
							bqrelse(tbp);
							goto finishcluster;
						}
					}
				}

				for (j = 0; j < tbp->b_npages; j += 1) {
					m = tbp->b_pages[j];
					vm_page_io_start(m);
					vm_object_pip_add(m->object, 1);
					if ((bp->b_npages == 0) ||
					  (bp->b_pages[bp->b_npages - 1] != m)) {
						bp->b_pages[bp->b_npages] = m;
						bp->b_npages++;
					}
				}
			}
			bp->b_bcount += size;
			bp->b_bufsize += size;

			s = splbio();
			bundirty(tbp);
			tbp->b_flags &= ~B_DONE;
			tbp->b_ioflags &= ~BIO_ERROR;
			tbp->b_flags |= B_ASYNC;
			tbp->b_iocmd = BIO_WRITE;
			reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
			++tbp->b_vp->v_numoutput;
			splx(s);
			BUF_KERNPROC(tbp);
			TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
				tbp, b_cluster.cluster_entry);
		}
	finishcluster:
		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
			(vm_page_t *) bp->b_pages, bp->b_npages);
		if (bp->b_bufsize > bp->b_kvasize)
			panic(
			    "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
			    bp->b_bufsize, bp->b_kvasize);
		bp->b_kvasize = bp->b_bufsize;
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
		bawrite(bp);

		len -= i;
	}
	return totalwritten;
}

/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
static struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *buflist;
	struct buf *bp;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp);
		buflist->bs_children[i] = bp;
		if (bp->b_blkno == bp->b_lblkno)
			VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
				NULL, NULL);
	}
	buflist->bs_children[i] = bp = last_bp;
	if (bp->b_blkno == bp->b_lblkno)
		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
			NULL, NULL);
	buflist->bs_nchildren = i + 1;
	return (buflist);
}