vfs_cluster.c revision 41529
11541Srgrimes/*- 21541Srgrimes * Copyright (c) 1993 31541Srgrimes * The Regents of the University of California. All rights reserved. 45455Sdg * Modifications/enhancements: 55455Sdg * Copyright (c) 1995 John S. Dyson. All rights reserved. 61541Srgrimes * 71541Srgrimes * Redistribution and use in source and binary forms, with or without 81541Srgrimes * modification, are permitted provided that the following conditions 91541Srgrimes * are met: 101541Srgrimes * 1. Redistributions of source code must retain the above copyright 111541Srgrimes * notice, this list of conditions and the following disclaimer. 121541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 131541Srgrimes * notice, this list of conditions and the following disclaimer in the 141541Srgrimes * documentation and/or other materials provided with the distribution. 151541Srgrimes * 3. All advertising materials mentioning features or use of this software 161541Srgrimes * must display the following acknowledgement: 171541Srgrimes * This product includes software developed by the University of 181541Srgrimes * California, Berkeley and its contributors. 191541Srgrimes * 4. Neither the name of the University nor the names of its contributors 201541Srgrimes * may be used to endorse or promote products derived from this software 211541Srgrimes * without specific prior written permission. 221541Srgrimes * 231541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 241541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 251541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 261541Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 271541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 281541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 291541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 301541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 311541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 321541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 331541Srgrimes * SUCH DAMAGE. 341541Srgrimes * 351541Srgrimes * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 3641529Smckusick * $Id: vfs_cluster.c,v 1.74 1998/11/17 00:31:12 mckusick Exp $ 371541Srgrimes */ 381541Srgrimes 3932929Seivind#include "opt_debug_cluster.h" 4032929Seivind 411541Srgrimes#include <sys/param.h> 421549Srgrimes#include <sys/systm.h> 4341168Sbde#include <sys/kernel.h> 441541Srgrimes#include <sys/proc.h> 451541Srgrimes#include <sys/buf.h> 461541Srgrimes#include <sys/vnode.h> 4741124Sdg#include <sys/malloc.h> 481541Srgrimes#include <sys/mount.h> 491541Srgrimes#include <sys/resourcevar.h> 506621Sdg#include <vm/vm.h> 5112662Sdg#include <vm/vm_prot.h> 5210541Sdyson#include <vm/vm_object.h> 5310541Sdyson#include <vm/vm_page.h> 541541Srgrimes 5521002Sdyson#if defined(CLUSTERDEBUG) 5621002Sdyson#include <sys/sysctl.h> 5721002Sdysonstatic int rcluster= 0; 5824484SbdeSYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, ""); 5921002Sdyson#endif 6021002Sdyson 6141124Sdgstatic MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer"); 6241124Sdg 6312973Sbdestatic struct cluster_save * 6412973Sbde cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp)); 6512973Sbdestatic struct buf * 6612973Sbde cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn, 6721002Sdyson daddr_t blkno, long size, int run, struct buf *fbp)); 681541Srgrimes 6912973Sbdeextern vm_page_t bogus_page; 705455Sdg 711541Srgrimes/* 7221002Sdyson * Maximum number of blocks for read-ahead. 731541Srgrimes */ 7421002Sdyson#define MAXRA 32 755455Sdg 761541Srgrimes/* 7721002Sdyson * This replaces bread. 7810541Sdyson */ 791549Srgrimesint 8021002Sdysoncluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) 811541Srgrimes struct vnode *vp; 821541Srgrimes u_quad_t filesize; 831541Srgrimes daddr_t lblkno; 841541Srgrimes long size; 851541Srgrimes struct ucred *cred; 8621002Sdyson long totread; 8721002Sdyson int seqcount; 881541Srgrimes struct buf **bpp; 891541Srgrimes{ 9021002Sdyson struct buf *bp, *rbp, *reqbp; 9131016Sphk daddr_t blkno, origblkno; 9221002Sdyson int error, num_ra; 9310541Sdyson int i; 9421002Sdyson int maxra, racluster; 9521002Sdyson long origtotread; 961541Srgrimes 971541Srgrimes error = 0; 9832724Sdyson if (vp->v_maxio == 0) 9932724Sdyson vp->v_maxio = DFLTPHYS; 10021002Sdyson 1015455Sdg /* 10221002Sdyson * Try to limit the amount of read-ahead by a few 10321002Sdyson * ad-hoc parameters. This needs work!!! 10421002Sdyson */ 10532724Sdyson racluster = vp->v_maxio/size; 10621002Sdyson maxra = 2 * racluster + (totread / size); 10721002Sdyson if (maxra > MAXRA) 10821002Sdyson maxra = MAXRA; 10921002Sdyson if (maxra > nbuf/8) 11021002Sdyson maxra = nbuf/8; 11121002Sdyson 11221002Sdyson /* 1135455Sdg * get the requested block 1145455Sdg */ 11521002Sdyson *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0); 11621002Sdyson origblkno = lblkno; 11721002Sdyson origtotread = totread; 11812767Sdyson 1195455Sdg /* 1205455Sdg * if it is in the cache, then check to see if the reads have been 1215455Sdg * sequential. If they have, then try some read-ahead, otherwise 1225455Sdg * back-off on prospective read-aheads. 1235455Sdg */ 1241541Srgrimes if (bp->b_flags & B_CACHE) { 12521002Sdyson if (!seqcount) { 1265455Sdg return 0; 12721002Sdyson } else if ((bp->b_flags & B_RAM) == 0) { 12821002Sdyson return 0; 12921002Sdyson } else { 13021002Sdyson int s; 13121002Sdyson struct buf *tbp; 13221002Sdyson bp->b_flags &= ~B_RAM; 13321002Sdyson /* 13421002Sdyson * We do the spl here so that there is no window 13521002Sdyson * between the incore and the b_usecount increment 13621002Sdyson * below. We opt to keep the spl out of the loop 13721002Sdyson * for efficiency. 13821002Sdyson */ 13921002Sdyson s = splbio(); 14021002Sdyson for(i=1;i<maxra;i++) { 14121002Sdyson 14221002Sdyson if (!(tbp = incore(vp, lblkno+i))) { 14321002Sdyson break; 14421002Sdyson } 14521002Sdyson 14621002Sdyson /* 14721002Sdyson * Set another read-ahead mark so we know to check 14821002Sdyson * again. 14921002Sdyson */ 15021002Sdyson if (((i % racluster) == (racluster - 1)) || 15121002Sdyson (i == (maxra - 1))) 15221002Sdyson tbp->b_flags |= B_RAM; 15321002Sdyson 15434694Sdyson if ((tbp->b_usecount < 1) && 15534206Sdyson ((tbp->b_flags & B_BUSY) == 0) && 15634206Sdyson (tbp->b_qindex == QUEUE_LRU)) { 15734206Sdyson TAILQ_REMOVE(&bufqueues[QUEUE_LRU], tbp, b_freelist); 15834206Sdyson TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], tbp, b_freelist); 15921002Sdyson } 16021002Sdyson } 16121002Sdyson splx(s); 16221002Sdyson if (i >= maxra) { 1635839Sdg return 0; 16410541Sdyson } 16521002Sdyson lblkno += i; 16621002Sdyson } 16721002Sdyson reqbp = bp = NULL; 16821002Sdyson } else { 16934611Sdyson off_t firstread; 17034611Sdyson firstread = bp->b_offset; 17134694Sdyson#ifdef DIAGNOSTIC 17234694Sdyson if (bp->b_offset == NOOFFSET) 17334694Sdyson panic("cluster_read: no buffer offset"); 17434694Sdyson#endif 17521002Sdyson if (firstread + totread > filesize) 17621002Sdyson totread = filesize - firstread; 17721002Sdyson if (totread > size) { 17821002Sdyson int nblks = 0; 17921002Sdyson int ncontigafter; 18021002Sdyson while (totread > 0) { 18121002Sdyson nblks++; 18221002Sdyson totread -= size; 18321002Sdyson } 18421002Sdyson if (nblks == 1) 18521002Sdyson goto single_block_read; 18621002Sdyson if (nblks > racluster) 18721002Sdyson nblks = racluster; 18821002Sdyson 18921002Sdyson error = VOP_BMAP(vp, lblkno, NULL, 19021002Sdyson &blkno, &ncontigafter, NULL); 19121002Sdyson if (error) 19221002Sdyson goto single_block_read; 19321002Sdyson if (blkno == -1) 19421002Sdyson goto single_block_read; 19521002Sdyson if (ncontigafter == 0) 19621002Sdyson goto single_block_read; 19721002Sdyson if (ncontigafter + 1 < nblks) 19821002Sdyson nblks = ncontigafter + 1; 19921002Sdyson 20021002Sdyson bp = cluster_rbuild(vp, filesize, lblkno, 20121002Sdyson blkno, size, nblks, bp); 20234694Sdyson lblkno += (bp->b_bufsize / size); 20310541Sdyson } else { 20421002Sdysonsingle_block_read: 20521002Sdyson /* 20621002Sdyson * if it isn't in the cache, then get a chunk from 20721002Sdyson * disk if sequential, otherwise just get the block. 20821002Sdyson */ 20921002Sdyson bp->b_flags |= B_READ | B_RAM; 21010541Sdyson lblkno += 1; 2118876Srgrimes } 2121541Srgrimes } 2135455Sdg 2145455Sdg /* 2155455Sdg * if we have been doing sequential I/O, then do some read-ahead 2165455Sdg */ 21721002Sdyson rbp = NULL; 21821002Sdyson if (seqcount && (lblkno < (origblkno + seqcount))) { 2191541Srgrimes /* 22021002Sdyson * we now build the read-ahead buffer if it is desirable. 2211541Srgrimes */ 22221002Sdyson if (((u_quad_t)(lblkno + 1) * size) <= filesize && 22321002Sdyson !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) && 22421002Sdyson blkno != -1) { 22521002Sdyson int nblksread; 22621002Sdyson int ntoread = num_ra + 1; 22721002Sdyson nblksread = (origtotread + size - 1) / size; 22821002Sdyson if (seqcount < nblksread) 22921002Sdyson seqcount = nblksread; 23021002Sdyson if (seqcount < ntoread) 23121002Sdyson ntoread = seqcount; 23221002Sdyson if (num_ra) { 23321002Sdyson rbp = cluster_rbuild(vp, filesize, lblkno, 23421002Sdyson blkno, size, ntoread, NULL); 23521002Sdyson } else { 23621002Sdyson rbp = getblk(vp, lblkno, size, 0, 0); 23721002Sdyson rbp->b_flags |= B_READ | B_ASYNC | B_RAM; 23821002Sdyson rbp->b_blkno = blkno; 2395455Sdg } 2401541Srgrimes } 2415455Sdg } 2421541Srgrimes 2435455Sdg /* 24410541Sdyson * handle the synchronous read 2455455Sdg */ 2465455Sdg if (bp) { 24721002Sdyson#if defined(CLUSTERDEBUG) 24836275Sdyson if (rcluster) 24937951Sbde printf("S(%ld,%ld,%d) ", 25037951Sbde (long)bp->b_lblkno, bp->b_bcount, seqcount); 25121002Sdyson#endif 25236275Sdyson if ((bp->b_flags & B_CLUSTER) == 0) 25336275Sdyson vfs_busy_pages(bp, 0); 25437384Sjulian error = VOP_STRATEGY(vp, bp); 25536275Sdyson curproc->p_stats->p_ru.ru_inblock++; 2565455Sdg } 25734611Sdyson 2585455Sdg /* 2595455Sdg * and if we have read-aheads, do them too 2605455Sdg */ 2615455Sdg if (rbp) { 26213490Sdyson if (error) { 2631541Srgrimes rbp->b_flags &= ~(B_ASYNC | B_READ); 2641541Srgrimes brelse(rbp); 26513490Sdyson } else if (rbp->b_flags & B_CACHE) { 26613490Sdyson rbp->b_flags &= ~(B_ASYNC | B_READ); 26713490Sdyson bqrelse(rbp); 2685455Sdg } else { 26921002Sdyson#if defined(CLUSTERDEBUG) 27021002Sdyson if (rcluster) { 27121002Sdyson if (bp) 27237951Sbde printf("A+(%ld,%ld,%ld,%d) ", 27337951Sbde (long)rbp->b_lblkno, rbp->b_bcount, 27437951Sbde (long)(rbp->b_lblkno - origblkno), 27537951Sbde seqcount); 27621002Sdyson else 27737951Sbde printf("A(%ld,%ld,%ld,%d) ", 27837951Sbde (long)rbp->b_lblkno, rbp->b_bcount, 27937951Sbde (long)(rbp->b_lblkno - origblkno), 28037951Sbde seqcount); 28121002Sdyson } 28221002Sdyson#endif 28321002Sdyson 28410541Sdyson if ((rbp->b_flags & B_CLUSTER) == 0) 28510541Sdyson vfs_busy_pages(rbp, 0); 28637384Sjulian (void) VOP_STRATEGY(vp, rbp); 2875455Sdg curproc->p_stats->p_ru.ru_inblock++; 2885455Sdg } 2895455Sdg } 29021002Sdyson if (reqbp) 29121002Sdyson return (biowait(reqbp)); 29221002Sdyson else 29321002Sdyson return (error); 2941541Srgrimes} 2951541Srgrimes 2961541Srgrimes/* 2971541Srgrimes * If blocks are contiguous on disk, use this to provide clustered 2981541Srgrimes * read ahead. We will read as many blocks as possible sequentially 2991541Srgrimes * and then parcel them up into logical blocks in the buffer hash table. 3001541Srgrimes */ 30110541Sdysonstatic struct buf * 30221002Sdysoncluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) 3031541Srgrimes struct vnode *vp; 3041541Srgrimes u_quad_t filesize; 3051541Srgrimes daddr_t lbn; 3061541Srgrimes daddr_t blkno; 3071541Srgrimes long size; 3081541Srgrimes int run; 30921002Sdyson struct buf *fbp; 3101541Srgrimes{ 31110541Sdyson struct buf *bp, *tbp; 3121541Srgrimes daddr_t bn; 31340648Sphk int i, inc, j; 3141541Srgrimes 3151541Srgrimes#ifdef DIAGNOSTIC 3161541Srgrimes if (size != vp->v_mount->mnt_stat.f_iosize) 31737951Sbde panic("cluster_rbuild: size %ld != filesize %ld\n", 3185455Sdg size, vp->v_mount->mnt_stat.f_iosize); 3191541Srgrimes#endif 32012767Sdyson /* 32112767Sdyson * avoid a division 32212767Sdyson */ 32312767Sdyson while ((u_quad_t) size * (lbn + run) > filesize) { 3241541Srgrimes --run; 32512767Sdyson } 32610541Sdyson 32721002Sdyson if (fbp) { 32821002Sdyson tbp = fbp; 32921002Sdyson tbp->b_flags |= B_READ; 33021002Sdyson } else { 33121002Sdyson tbp = getblk(vp, lbn, size, 0, 0); 33221002Sdyson if (tbp->b_flags & B_CACHE) 33321002Sdyson return tbp; 33421002Sdyson tbp->b_flags |= B_ASYNC | B_READ | B_RAM; 33521002Sdyson } 33610541Sdyson 33710541Sdyson tbp->b_blkno = blkno; 33816086Sdyson if( (tbp->b_flags & B_MALLOC) || 33916086Sdyson ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) 34010541Sdyson return tbp; 34110541Sdyson 34210541Sdyson bp = trypbuf(); 34310541Sdyson if (bp == 0) 34410541Sdyson return tbp; 34510541Sdyson 34637467Sbde bp->b_data = (char *)((vm_offset_t)bp->b_data | 34737467Sbde ((vm_offset_t)tbp->b_data & PAGE_MASK)); 34810541Sdyson bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO; 3495455Sdg bp->b_iodone = cluster_callback; 3505455Sdg bp->b_blkno = blkno; 3515455Sdg bp->b_lblkno = lbn; 35234611Sdyson bp->b_offset = tbp->b_offset; 35334694Sdyson#ifdef DIAGNOSTIC 35434694Sdyson if (bp->b_offset == NOOFFSET) 35534694Sdyson panic("cluster_rbuild: no buffer offset"); 35634694Sdyson#endif 3575455Sdg pbgetvp(vp, bp); 3581541Srgrimes 35912404Sdyson TAILQ_INIT(&bp->b_cluster.cluster_head); 3601541Srgrimes 3615455Sdg bp->b_bcount = 0; 3625455Sdg bp->b_bufsize = 0; 3635455Sdg bp->b_npages = 0; 3645455Sdg 36532724Sdyson if (vp->v_maxio == 0) 36632724Sdyson vp->v_maxio = DFLTPHYS; 3671541Srgrimes inc = btodb(size); 36810541Sdyson for (bn = blkno, i = 0; i < run; ++i, bn += inc) { 3695455Sdg if (i != 0) { 37012767Sdyson if ((bp->b_npages * PAGE_SIZE) + 37132724Sdyson round_page(size) > vp->v_maxio) 37210541Sdyson break; 37310978Sdyson 37434611Sdyson if (tbp = incore(vp, lbn + i)) { 37534611Sdyson if (tbp->b_flags & B_BUSY) 37634611Sdyson break; 37712767Sdyson 37834611Sdyson for (j = 0; j < tbp->b_npages; j++) 37934611Sdyson if (tbp->b_pages[j]->valid) 38034611Sdyson break; 38134611Sdyson 38234611Sdyson if (j != tbp->b_npages) 38334611Sdyson break; 38434611Sdyson 38534611Sdyson if (tbp->b_bcount != size) 38634611Sdyson break; 38734611Sdyson } 38834611Sdyson 3895455Sdg tbp = getblk(vp, lbn + i, size, 0, 0); 39010541Sdyson 3915455Sdg if ((tbp->b_flags & B_CACHE) || 39210541Sdyson (tbp->b_flags & B_VMIO) == 0) { 39313490Sdyson bqrelse(tbp); 3945455Sdg break; 3955455Sdg } 39610541Sdyson 39734611Sdyson for (j = 0;j < tbp->b_npages; j++) 39834611Sdyson if (tbp->b_pages[j]->valid) 39910541Sdyson break; 40010541Sdyson 40110541Sdyson if (j != tbp->b_npages) { 40234611Sdyson bqrelse(tbp); 40310541Sdyson break; 40410541Sdyson } 40510541Sdyson 40621002Sdyson if ((fbp && (i == 1)) || (i == (run - 1))) 40721002Sdyson tbp->b_flags |= B_RAM; 40810541Sdyson tbp->b_flags |= B_READ | B_ASYNC; 40912767Sdyson if (tbp->b_blkno == tbp->b_lblkno) { 41010541Sdyson tbp->b_blkno = bn; 41110541Sdyson } else if (tbp->b_blkno != bn) { 41210541Sdyson brelse(tbp); 41310541Sdyson break; 41410541Sdyson } 4151541Srgrimes } 41612404Sdyson TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, 41712404Sdyson tbp, b_cluster.cluster_entry); 4185455Sdg for (j = 0; j < tbp->b_npages; j += 1) { 41910541Sdyson vm_page_t m; 42010541Sdyson m = tbp->b_pages[j]; 42138799Sdfr vm_page_io_start(m); 42238517Sdfr vm_object_pip_add(m->object, 1); 42310541Sdyson if ((bp->b_npages == 0) || 42412413Sdyson (bp->b_pages[bp->b_npages-1] != m)) { 42510541Sdyson bp->b_pages[bp->b_npages] = m; 42610541Sdyson bp->b_npages++; 42710541Sdyson } 42818737Sdyson if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) 42918737Sdyson tbp->b_pages[j] = bogus_page; 4301541Srgrimes } 43110541Sdyson bp->b_bcount += tbp->b_bcount; 43210541Sdyson bp->b_bufsize += tbp->b_bufsize; 4331541Srgrimes } 43418737Sdyson 43518737Sdyson for(j=0;j<bp->b_npages;j++) { 43618737Sdyson if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) == 43718737Sdyson VM_PAGE_BITS_ALL) 43818737Sdyson bp->b_pages[j] = bogus_page; 43918737Sdyson } 44020054Sdyson if (bp->b_bufsize > bp->b_kvasize) 44137559Sbde panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", 44237559Sbde bp->b_bufsize, bp->b_kvasize); 44320054Sdyson bp->b_kvasize = bp->b_bufsize; 44418737Sdyson 44510541Sdyson pmap_qenter(trunc_page((vm_offset_t) bp->b_data), 44610541Sdyson (vm_page_t *)bp->b_pages, bp->b_npages); 4475455Sdg return (bp); 4481541Srgrimes} 4491541Srgrimes 4501541Srgrimes/* 4511541Srgrimes * Cleanup after a clustered read or write. 4521541Srgrimes * This is complicated by the fact that any of the buffers might have 4531541Srgrimes * extra memory (if there were no empty buffer headers at allocbuf time) 4541541Srgrimes * that we will need to shift around. 4551541Srgrimes */ 4561541Srgrimesvoid 4571541Srgrimescluster_callback(bp) 4581541Srgrimes struct buf *bp; 4591541Srgrimes{ 46012404Sdyson struct buf *nbp, *tbp; 4611541Srgrimes int error = 0; 4621541Srgrimes 4631541Srgrimes /* 4641541Srgrimes * Must propogate errors to all the components. 4651541Srgrimes */ 4661541Srgrimes if (bp->b_flags & B_ERROR) 4671541Srgrimes error = bp->b_error; 4681541Srgrimes 46910541Sdyson pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); 4701541Srgrimes /* 4711541Srgrimes * Move memory from the large cluster buffer into the component 4721541Srgrimes * buffers and mark IO as done on these. 4731541Srgrimes */ 47421002Sdyson for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); 47512404Sdyson tbp; tbp = nbp) { 47621002Sdyson nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); 4771541Srgrimes if (error) { 4781541Srgrimes tbp->b_flags |= B_ERROR; 4791541Srgrimes tbp->b_error = error; 48025135Sdfr } else 48125135Sdfr tbp->b_dirtyoff = tbp->b_dirtyend = 0; 4821541Srgrimes biodone(tbp); 4831541Srgrimes } 4845455Sdg relpbuf(bp); 4851541Srgrimes} 4861541Srgrimes 4871541Srgrimes/* 4881541Srgrimes * Do clustered write for FFS. 4891541Srgrimes * 4901541Srgrimes * Three cases: 4911541Srgrimes * 1. Write is not sequential (write asynchronously) 4921541Srgrimes * Write is sequential: 4931541Srgrimes * 2. beginning of cluster - begin cluster 4941541Srgrimes * 3. middle of a cluster - add to cluster 4951541Srgrimes * 4. end of a cluster - asynchronously write cluster 4961541Srgrimes */ 4971541Srgrimesvoid 4981541Srgrimescluster_write(bp, filesize) 4995455Sdg struct buf *bp; 5001541Srgrimes u_quad_t filesize; 5011541Srgrimes{ 5025455Sdg struct vnode *vp; 5035455Sdg daddr_t lbn; 5045455Sdg int maxclen, cursize; 5055455Sdg int lblocksize; 50612404Sdyson int async; 5071541Srgrimes 5085455Sdg vp = bp->b_vp; 50932724Sdyson if (vp->v_maxio == 0) 51032724Sdyson vp->v_maxio = DFLTPHYS; 51132286Sdyson if (vp->v_type == VREG) { 51232286Sdyson async = vp->v_mount->mnt_flag & MNT_ASYNC; 51332286Sdyson lblocksize = vp->v_mount->mnt_stat.f_iosize; 51432286Sdyson } else { 51532286Sdyson async = 0; 51632286Sdyson lblocksize = bp->b_bufsize; 51732286Sdyson } 5185455Sdg lbn = bp->b_lblkno; 5191541Srgrimes 52034694Sdyson#ifdef DIAGNOSTIC 52134694Sdyson if (bp->b_offset == NOOFFSET) 52234694Sdyson panic("cluster_write: no buffer offset"); 52334694Sdyson#endif 52434694Sdyson 5251541Srgrimes /* Initialize vnode to beginning of file. */ 5261541Srgrimes if (lbn == 0) 5271541Srgrimes vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; 5281541Srgrimes 5295455Sdg if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || 5305455Sdg (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { 53132724Sdyson maxclen = vp->v_maxio / lblocksize - 1; 5321541Srgrimes if (vp->v_clen != 0) { 5331541Srgrimes /* 5341541Srgrimes * Next block is not sequential. 5358876Srgrimes * 5361541Srgrimes * If we are not writing at end of file, the process 5375455Sdg * seeked to another point in the file since its last 5385455Sdg * write, or we have reached our maximum cluster size, 5395455Sdg * then push the previous cluster. Otherwise try 5405455Sdg * reallocating to make it sequential. 5411541Srgrimes */ 5421541Srgrimes cursize = vp->v_lastw - vp->v_cstart + 1; 54334611Sdyson if (((u_quad_t) bp->b_offset + lblocksize) != filesize || 54410541Sdyson lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { 54512404Sdyson if (!async) 54612404Sdyson cluster_wbuild(vp, lblocksize, 54712404Sdyson vp->v_cstart, cursize); 54810541Sdyson } else { 54910541Sdyson struct buf **bpp, **endbp; 55010541Sdyson struct cluster_save *buflist; 55110541Sdyson 55210541Sdyson buflist = cluster_collectbufs(vp, bp); 55310541Sdyson endbp = &buflist->bs_children 55410541Sdyson [buflist->bs_nchildren - 1]; 55510541Sdyson if (VOP_REALLOCBLKS(vp, buflist)) { 55610541Sdyson /* 55710541Sdyson * Failed, push the previous cluster. 55810541Sdyson */ 55910541Sdyson for (bpp = buflist->bs_children; 56010541Sdyson bpp < endbp; bpp++) 56110541Sdyson brelse(*bpp); 56210541Sdyson free(buflist, M_SEGMENT); 56312404Sdyson cluster_wbuild(vp, lblocksize, 56412404Sdyson vp->v_cstart, cursize); 56510541Sdyson } else { 56610541Sdyson /* 56710541Sdyson * Succeeded, keep building cluster. 56810541Sdyson */ 56910541Sdyson for (bpp = buflist->bs_children; 57010541Sdyson bpp <= endbp; bpp++) 57110541Sdyson bdwrite(*bpp); 57210541Sdyson free(buflist, M_SEGMENT); 57310541Sdyson vp->v_lastw = lbn; 57410541Sdyson vp->v_lasta = bp->b_blkno; 57510541Sdyson return; 57610541Sdyson } 57710541Sdyson } 5781541Srgrimes } 5791541Srgrimes /* 5805455Sdg * Consider beginning a cluster. If at end of file, make 5815455Sdg * cluster as large as possible, otherwise find size of 5825455Sdg * existing cluster. 5831541Srgrimes */ 58432286Sdyson if ((vp->v_type == VREG) && 58534611Sdyson ((u_quad_t) bp->b_offset + lblocksize) != filesize && 5867613Sdg (bp->b_blkno == bp->b_lblkno) && 58710551Sdyson (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || 58810541Sdyson bp->b_blkno == -1)) { 5891541Srgrimes bawrite(bp); 5901541Srgrimes vp->v_clen = 0; 5911541Srgrimes vp->v_lasta = bp->b_blkno; 5921541Srgrimes vp->v_cstart = lbn + 1; 5931541Srgrimes vp->v_lastw = lbn; 5941541Srgrimes return; 5951541Srgrimes } 5965455Sdg vp->v_clen = maxclen; 59712404Sdyson if (!async && maxclen == 0) { /* I/O not contiguous */ 5981541Srgrimes vp->v_cstart = lbn + 1; 59913490Sdyson bawrite(bp); 6005455Sdg } else { /* Wait for rest of cluster */ 6011541Srgrimes vp->v_cstart = lbn; 6025455Sdg bdwrite(bp); 6031541Srgrimes } 6041541Srgrimes } else if (lbn == vp->v_cstart + vp->v_clen) { 6051541Srgrimes /* 6061541Srgrimes * At end of cluster, write it out. 6071541Srgrimes */ 60812404Sdyson bdwrite(bp); 60913490Sdyson cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); 6101541Srgrimes vp->v_clen = 0; 6111541Srgrimes vp->v_cstart = lbn + 1; 6121541Srgrimes } else 6131541Srgrimes /* 6145455Sdg * In the middle of a cluster, so just delay the I/O for now. 6151541Srgrimes */ 6161541Srgrimes bdwrite(bp); 6171541Srgrimes vp->v_lastw = lbn; 6181541Srgrimes vp->v_lasta = bp->b_blkno; 6191541Srgrimes} 6201541Srgrimes 6211541Srgrimes 6221541Srgrimes/* 6231541Srgrimes * This is an awful lot like cluster_rbuild...wish they could be combined. 6241541Srgrimes * The last lbn argument is the current block on which I/O is being 6251541Srgrimes * performed. Check to see that it doesn't fall in the middle of 6261541Srgrimes * the current block (if last_bp == NULL). 6271541Srgrimes */ 62812767Sdysonint 62912404Sdysoncluster_wbuild(vp, size, start_lbn, len) 6301541Srgrimes struct vnode *vp; 6311541Srgrimes long size; 6321541Srgrimes daddr_t start_lbn; 6331541Srgrimes int len; 6341541Srgrimes{ 63512404Sdyson struct buf *bp, *tbp; 6365455Sdg int i, j, s; 63712767Sdyson int totalwritten = 0; 63812404Sdyson int dbsize = btodb(size); 63935595Sbde 64035595Sbde if (vp->v_maxio == 0) 64135595Sbde vp->v_maxio = DFLTPHYS; 64212767Sdyson while (len > 0) { 64312767Sdyson s = splbio(); 64432286Sdyson if (((tbp = gbincore(vp, start_lbn)) == NULL) || 64534630Sjulian ((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) { 64612767Sdyson ++start_lbn; 64712767Sdyson --len; 64812767Sdyson splx(s); 64912767Sdyson continue; 65012767Sdyson } 65112767Sdyson bremfree(tbp); 65212767Sdyson tbp->b_flags |= B_BUSY; 65312767Sdyson tbp->b_flags &= ~B_DONE; 65412767Sdyson splx(s); 6551541Srgrimes 6561541Srgrimes /* 6575455Sdg * Extra memory in the buffer, punt on this buffer. XXX we could 6585455Sdg * handle this in most cases, but we would have to push the extra 6595455Sdg * memory down to after our max possible cluster size and then 6605455Sdg * potentially pull it back up if the cluster was terminated 6615455Sdg * prematurely--too much hassle. 6621541Srgrimes */ 66314319Sdyson if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) || 66434630Sjulian (tbp->b_bcount != tbp->b_bufsize) || 66534630Sjulian (tbp->b_bcount != size) || 66634630Sjulian (len == 1) || 66734630Sjulian ((bp = trypbuf()) == NULL)) { 66812767Sdyson totalwritten += tbp->b_bufsize; 66912767Sdyson bawrite(tbp); 67012767Sdyson ++start_lbn; 67112767Sdyson --len; 67212767Sdyson continue; 67312767Sdyson } 67412404Sdyson 67534630Sjulian /* 67634630Sjulian * We got a pbuf to make the cluster in. 67734630Sjulian * so initialise it. 67834630Sjulian */ 67912767Sdyson TAILQ_INIT(&bp->b_cluster.cluster_head); 68012767Sdyson bp->b_bcount = 0; 68112767Sdyson bp->b_bufsize = 0; 68212767Sdyson bp->b_npages = 0; 68317304Sdyson if (tbp->b_wcred != NOCRED) { 68417304Sdyson bp->b_wcred = tbp->b_wcred; 68517304Sdyson crhold(bp->b_wcred); 68617304Sdyson } 6871541Srgrimes 68812767Sdyson bp->b_blkno = tbp->b_blkno; 68912767Sdyson bp->b_lblkno = tbp->b_lblkno; 69034611Sdyson bp->b_offset = tbp->b_offset; 69137467Sbde bp->b_data = (char *)((vm_offset_t)bp->b_data | 69237467Sbde ((vm_offset_t)tbp->b_data & PAGE_MASK)); 69332286Sdyson bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER | 69434694Sdyson (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT)); 69512767Sdyson bp->b_iodone = cluster_callback; 69612767Sdyson pbgetvp(vp, bp); 69734630Sjulian /* 69834630Sjulian * From this location in the file, scan forward to see 69934630Sjulian * if there are buffers with adjacent data that need to 70034630Sjulian * be written as well. 70134630Sjulian */ 70212767Sdyson for (i = 0; i < len; ++i, ++start_lbn) { 70334630Sjulian if (i != 0) { /* If not the first buffer */ 70412767Sdyson s = splbio(); 70534630Sjulian /* 70634630Sjulian * If the adjacent data is not even in core it 70734630Sjulian * can't need to be written. 70834630Sjulian */ 70912767Sdyson if ((tbp = gbincore(vp, start_lbn)) == NULL) { 71012767Sdyson splx(s); 71112767Sdyson break; 71212767Sdyson } 7131541Srgrimes 71434630Sjulian /* 71534630Sjulian * If it IS in core, but has different 71634630Sjulian * characteristics, don't cluster with it. 71734630Sjulian */ 71834630Sjulian if ((tbp->b_flags & 71934694Sdyson (B_VMIO | B_CLUSTEROK | B_INVAL | B_BUSY | 72034694Sdyson B_DELWRI | B_NEEDCOMMIT)) 72134694Sdyson != (B_DELWRI | B_CLUSTEROK | 72234694Sdyson (bp->b_flags & (B_VMIO | B_NEEDCOMMIT)))) { 72312767Sdyson splx(s); 72412767Sdyson break; 72512767Sdyson } 72612767Sdyson 72717304Sdyson if (tbp->b_wcred != bp->b_wcred) { 72817304Sdyson splx(s); 72917304Sdyson break; 73017304Sdyson } 73117304Sdyson 73234630Sjulian /* 73334630Sjulian * Check that the combined cluster 73434630Sjulian * would make sense with regard to pages 73534630Sjulian * and would not be too large 73634630Sjulian */ 73712767Sdyson if ((tbp->b_bcount != size) || 73834630Sjulian ((bp->b_blkno + (dbsize * i)) != 73934694Sdyson tbp->b_blkno) || 74034630Sjulian ((tbp->b_npages + bp->b_npages) > 74134630Sjulian (vp->v_maxio / PAGE_SIZE))) { 74212767Sdyson splx(s); 74312767Sdyson break; 74412767Sdyson } 74534630Sjulian /* 74634630Sjulian * Ok, it's passed all the tests, 74734630Sjulian * so remove it from the free list 74834630Sjulian * and mark it busy. We will use it. 74934630Sjulian */ 75012767Sdyson bremfree(tbp); 75112767Sdyson tbp->b_flags |= B_BUSY; 75212767Sdyson tbp->b_flags &= ~B_DONE; 75312404Sdyson splx(s); 75434630Sjulian } /* end of code for non-first buffers only */ 75534266Sjulian /* check for latent dependencies to be handled */ 75634266Sjulian if ((LIST_FIRST(&tbp->b_dep)) != NULL && 75734266Sjulian bioops.io_start) 75834266Sjulian (*bioops.io_start)(tbp); 75934630Sjulian /* 76034630Sjulian * If the IO is via the VM then we do some 76134630Sjulian * special VM hackery. (yuck) 76234630Sjulian */ 76313490Sdyson if (tbp->b_flags & B_VMIO) { 76432937Sdyson vm_page_t m; 76532937Sdyson 76634630Sjulian if (i != 0) { /* if not first buffer */ 76732937Sdyson for (j = 0; j < tbp->b_npages; j += 1) { 76832937Sdyson m = tbp->b_pages[j]; 76932937Sdyson if (m->flags & PG_BUSY) 77032937Sdyson goto finishcluster; 77132937Sdyson } 77232937Sdyson } 77332937Sdyson 77413490Sdyson for (j = 0; j < tbp->b_npages; j += 1) { 77513490Sdyson m = tbp->b_pages[j]; 77638799Sdfr vm_page_io_start(m); 77738517Sdfr vm_object_pip_add(m->object, 1); 77813490Sdyson if ((bp->b_npages == 0) || 77934630Sjulian (bp->b_pages[bp->b_npages - 1] != m)) { 78013490Sdyson bp->b_pages[bp->b_npages] = m; 78113490Sdyson bp->b_npages++; 78213490Sdyson } 78312767Sdyson } 78412767Sdyson } 78512767Sdyson bp->b_bcount += size; 78612767Sdyson bp->b_bufsize += size; 7871541Srgrimes 78838299Sdfr s = splbio(); 78926664Sdyson --numdirtybuffers; 79012767Sdyson tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); 79112767Sdyson tbp->b_flags |= B_ASYNC; 79212767Sdyson reassignbuf(tbp, tbp->b_vp); /* put on clean list */ 79312767Sdyson ++tbp->b_vp->v_numoutput; 79438299Sdfr splx(s); 79512767Sdyson TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, 79612767Sdyson tbp, b_cluster.cluster_entry); 7971541Srgrimes } 79832937Sdyson finishcluster: 79912767Sdyson pmap_qenter(trunc_page((vm_offset_t) bp->b_data), 80012767Sdyson (vm_page_t *) bp->b_pages, bp->b_npages); 80120054Sdyson if (bp->b_bufsize > bp->b_kvasize) 80237559Sbde panic( 80337559Sbde "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", 80437559Sbde bp->b_bufsize, bp->b_kvasize); 80520054Sdyson bp->b_kvasize = bp->b_bufsize; 80612767Sdyson totalwritten += bp->b_bufsize; 80717304Sdyson bp->b_dirtyoff = 0; 80817304Sdyson bp->b_dirtyend = bp->b_bufsize; 80912767Sdyson bawrite(bp); 8101541Srgrimes 81112767Sdyson len -= i; 8121541Srgrimes } 81312767Sdyson return totalwritten; 8141541Srgrimes} 8151541Srgrimes 8161541Srgrimes/* 8171541Srgrimes * Collect together all the buffers in a cluster. 8181541Srgrimes * Plus add one additional buffer. 8191541Srgrimes */ 82012973Sbdestatic struct cluster_save * 8211541Srgrimescluster_collectbufs(vp, last_bp) 8221541Srgrimes struct vnode *vp; 8231541Srgrimes struct buf *last_bp; 8241541Srgrimes{ 8251541Srgrimes struct cluster_save *buflist; 82641205Smckusick struct buf *bp; 8275455Sdg daddr_t lbn; 8281541Srgrimes int i, len; 8291541Srgrimes 8301541Srgrimes len = vp->v_lastw - vp->v_cstart + 1; 8311541Srgrimes buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), 8321541Srgrimes M_SEGMENT, M_WAITOK); 8331541Srgrimes buflist->bs_nchildren = 0; 8345455Sdg buflist->bs_children = (struct buf **) (buflist + 1); 83541205Smckusick for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { 83641205Smckusick (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp); 83741205Smckusick buflist->bs_children[i] = bp; 83841205Smckusick if (bp->b_blkno == bp->b_lblkno) 83941205Smckusick VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, 84041205Smckusick NULL, NULL); 84141205Smckusick } 84241529Smckusick buflist->bs_children[i] = bp = last_bp; 84341529Smckusick if (bp->b_blkno == bp->b_lblkno) 84441529Smckusick VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, 84541529Smckusick NULL, NULL); 8461541Srgrimes buflist->bs_nchildren = i + 1; 8471541Srgrimes return (buflist); 8481541Srgrimes} 849