/* vfs_cluster.c revision 6837 */
11541Srgrimes/*- 21541Srgrimes * Copyright (c) 1993 31541Srgrimes * The Regents of the University of California. All rights reserved. 45455Sdg * Modifications/enhancements: 55455Sdg * Copyright (c) 1995 John S. Dyson. All rights reserved. 61541Srgrimes * 71541Srgrimes * Redistribution and use in source and binary forms, with or without 81541Srgrimes * modification, are permitted provided that the following conditions 91541Srgrimes * are met: 101541Srgrimes * 1. Redistributions of source code must retain the above copyright 111541Srgrimes * notice, this list of conditions and the following disclaimer. 121541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 131541Srgrimes * notice, this list of conditions and the following disclaimer in the 141541Srgrimes * documentation and/or other materials provided with the distribution. 151541Srgrimes * 3. All advertising materials mentioning features or use of this software 161541Srgrimes * must display the following acknowledgement: 171541Srgrimes * This product includes software developed by the University of 181541Srgrimes * California, Berkeley and its contributors. 191541Srgrimes * 4. Neither the name of the University nor the names of its contributors 201541Srgrimes * may be used to endorse or promote products derived from this software 211541Srgrimes * without specific prior written permission. 221541Srgrimes * 231541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 241541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 251541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 261541Srgrimes * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 271541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 281541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 291541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 301541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 311541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 321541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 331541Srgrimes * SUCH DAMAGE. 341541Srgrimes * 351541Srgrimes * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 366837Sdg * $Id: vfs_cluster.c,v 1.10 1995/02/22 09:39:20 davidg Exp $ 371541Srgrimes */ 381541Srgrimes 391541Srgrimes#include <sys/param.h> 401549Srgrimes#include <sys/systm.h> 411541Srgrimes#include <sys/proc.h> 421541Srgrimes#include <sys/buf.h> 431541Srgrimes#include <sys/vnode.h> 441541Srgrimes#include <sys/mount.h> 451541Srgrimes#include <sys/trace.h> 461541Srgrimes#include <sys/malloc.h> 471541Srgrimes#include <sys/resourcevar.h> 485455Sdg#include <sys/vmmeter.h> 495455Sdg#include <miscfs/specfs/specdev.h> 506621Sdg#include <vm/vm.h> 516621Sdg#include <vm/vm_pageout.h> 521541Srgrimes 531541Srgrimes#ifdef DEBUG 541541Srgrimes#include <vm/vm.h> 551541Srgrimes#include <sys/sysctl.h> 563055Sdgint doreallocblks = 0; 575455Sdgstruct ctldebug debug13 = {"doreallocblks", &doreallocblks}; 585455Sdg 591541Srgrimes#else 601541Srgrimes/* XXX for cluster_write */ 613055Sdg#define doreallocblks 0 621541Srgrimes#endif 631541Srgrimes 641541Srgrimes/* 651541Srgrimes * Local declarations 661541Srgrimes */ 671541Srgrimesstruct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *, 685455Sdg daddr_t, daddr_t, long, int, long)); 695455Sdgvoid cluster_wbuild __P((struct vnode *, struct buf *, long, daddr_t, int, daddr_t)); 701541Srgrimesstruct cluster_save *cluster_collectbufs 
__P((struct vnode *, struct buf *)); 711541Srgrimes 725455Sdgint totreads; 735455Sdgint totreadblocks; 745455Sdg 751541Srgrimes#ifdef DIAGNOSTIC 761541Srgrimes/* 771541Srgrimes * Set to 1 if reads of block zero should cause readahead to be done. 781541Srgrimes * Set to 0 treats a read of block zero as a non-sequential read. 791541Srgrimes * 801541Srgrimes * Setting to one assumes that most reads of block zero of files are due to 811541Srgrimes * sequential passes over the files (e.g. cat, sum) where additional blocks 821541Srgrimes * will soon be needed. Setting to zero assumes that the majority are 831541Srgrimes * surgical strikes to get particular info (e.g. size, file) where readahead 841541Srgrimes * blocks will not be used and, in fact, push out other potentially useful 851541Srgrimes * blocks from the cache. The former seems intuitive, but some quick tests 861541Srgrimes * showed that the latter performed better from a system-wide point of view. 871541Srgrimes */ 885455Sdg int doclusterraz = 0; 895455Sdg 901541Srgrimes#define ISSEQREAD(vp, blk) \ 911541Srgrimes (((blk) != 0 || doclusterraz) && \ 921541Srgrimes ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr)) 931541Srgrimes#else 941541Srgrimes#define ISSEQREAD(vp, blk) \ 955839Sdg (/* (blk) != 0 && */ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr)) 961541Srgrimes#endif 971541Srgrimes 981541Srgrimes/* 991541Srgrimes * This replaces bread. If this is a bread at the beginning of a file and 1001541Srgrimes * lastr is 0, we assume this is the first read and we'll read up to two 1011541Srgrimes * blocks if they are sequential. After that, we'll do regular read ahead 1021541Srgrimes * in clustered chunks. 1031541Srgrimes * bp is the block requested. 1041541Srgrimes * rbp is the read-ahead block. 1051541Srgrimes * If either is NULL, then you don't have to do the I/O. 
1061541Srgrimes */ 1071549Srgrimesint 1081541Srgrimescluster_read(vp, filesize, lblkno, size, cred, bpp) 1091541Srgrimes struct vnode *vp; 1101541Srgrimes u_quad_t filesize; 1111541Srgrimes daddr_t lblkno; 1121541Srgrimes long size; 1131541Srgrimes struct ucred *cred; 1141541Srgrimes struct buf **bpp; 1151541Srgrimes{ 1161541Srgrimes struct buf *bp, *rbp; 1175455Sdg daddr_t blkno, rablkno, origlblkno; 1181541Srgrimes long flags; 1191541Srgrimes int error, num_ra, alreadyincore; 1201541Srgrimes 1215455Sdg origlblkno = lblkno; 1221541Srgrimes error = 0; 1235455Sdg /* 1245455Sdg * get the requested block 1255455Sdg */ 1261541Srgrimes *bpp = bp = getblk(vp, lblkno, size, 0, 0); 1275455Sdg /* 1285455Sdg * if it is in the cache, then check to see if the reads have been 1295455Sdg * sequential. If they have, then try some read-ahead, otherwise 1305455Sdg * back-off on prospective read-aheads. 1315455Sdg */ 1321541Srgrimes if (bp->b_flags & B_CACHE) { 1335455Sdg int i; 1345455Sdg 1355455Sdg if (!ISSEQREAD(vp, origlblkno)) { 1366621Sdg vp->v_maxra = bp->b_lblkno + bp->b_bcount / size; 1375455Sdg vp->v_ralen >>= 1; 1385455Sdg return 0; 1396621Sdg } else if( vp->v_maxra >= origlblkno) { 1405839Sdg if ((vp->v_ralen + 1) < (MAXPHYS / size)) 1415839Sdg vp->v_ralen++; 1426621Sdg if ( vp->v_maxra >= (origlblkno + vp->v_ralen)) 1435839Sdg return 0; 1446621Sdg lblkno = vp->v_maxra; 1455839Sdg } 1465455Sdg bp = NULL; 1475455Sdg } else { 1481541Srgrimes /* 1495455Sdg * if it isn't in the cache, then get a chunk from disk if 1505455Sdg * sequential, otherwise just get the block. 
1511541Srgrimes */ 1521541Srgrimes bp->b_flags |= B_READ; 1535455Sdg lblkno += 1; 1545455Sdg curproc->p_stats->p_ru.ru_inblock++; /* XXX */ 1551541Srgrimes } 1561541Srgrimes /* 1575455Sdg * if ralen is "none", then try a little 1581541Srgrimes */ 1595455Sdg if (vp->v_ralen == 0) 1605455Sdg vp->v_ralen = 1; 1615455Sdg /* 1625455Sdg * assume no read-ahead 1635455Sdg */ 1645455Sdg alreadyincore = 1; 1655455Sdg rablkno = lblkno; 1665455Sdg 1675455Sdg /* 1685455Sdg * if we have been doing sequential I/O, then do some read-ahead 1695455Sdg */ 1705455Sdg if (ISSEQREAD(vp, origlblkno)) { 1715455Sdg int i; 1725455Sdg 1731541Srgrimes /* 1745455Sdg * this code makes sure that the stuff that we have read-ahead 1755455Sdg * is still in the cache. If it isn't, we have been reading 1765455Sdg * ahead too much, and we need to back-off, otherwise we might 1775455Sdg * try to read more. 1781541Srgrimes */ 1795455Sdg for (i = 0; i < vp->v_ralen; i++) { 1805455Sdg rablkno = lblkno + i; 1815455Sdg alreadyincore = (int) incore(vp, rablkno); 1825455Sdg if (!alreadyincore) { 1835455Sdg if (rablkno < vp->v_maxra) { 1845455Sdg vp->v_maxra = rablkno; 1855455Sdg vp->v_ralen >>= 1; 1865455Sdg alreadyincore = 1; 1875455Sdg } else { 1886621Sdg if (inmem(vp, rablkno)) { 1896621Sdg if( vp->v_maxra < rablkno) 1906621Sdg vp->v_maxra = rablkno + 1; 1915455Sdg continue; 1926621Sdg } 1935455Sdg if ((vp->v_ralen + 1) < MAXPHYS / size) 1945455Sdg vp->v_ralen++; 1955455Sdg } 1965455Sdg break; 1976621Sdg } else if( vp->v_maxra < rablkno) { 1986621Sdg vp->v_maxra = rablkno + 1; 1995455Sdg } 2001541Srgrimes } 2015455Sdg } 2025455Sdg /* 2035455Sdg * we now build the read-ahead buffer if it is desirable. 
2045455Sdg */ 2055455Sdg rbp = NULL; 2065455Sdg if (!alreadyincore && 2075455Sdg (rablkno + 1) * size <= filesize && 2085455Sdg !(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra)) && 2095455Sdg blkno != -1) { 2105839Sdg if ((vp->v_ralen + 1) < MAXPHYS / size) 2115839Sdg vp->v_ralen++; 2125455Sdg if (num_ra > vp->v_ralen) 2135455Sdg num_ra = vp->v_ralen; 2141541Srgrimes 2156621Sdg if (num_ra) { 2161541Srgrimes rbp = cluster_rbuild(vp, filesize, 2175455Sdg NULL, rablkno, blkno, size, num_ra, B_READ | B_ASYNC); 2181541Srgrimes } else { 2195455Sdg rbp = getblk(vp, rablkno, size, 0, 0); 2205455Sdg rbp->b_flags |= B_READ | B_ASYNC; 2211541Srgrimes rbp->b_blkno = blkno; 2221541Srgrimes } 2231541Srgrimes } 2245839Sdg 2255455Sdg /* 2265455Sdg * if the synchronous read is a cluster, handle it, otherwise do a 2275455Sdg * simple, non-clustered read. 2285455Sdg */ 2295455Sdg if (bp) { 2301541Srgrimes if (bp->b_flags & (B_DONE | B_DELWRI)) 2311541Srgrimes panic("cluster_read: DONE bp"); 2325455Sdg else { 2335455Sdg vfs_busy_pages(bp, 0); 2341541Srgrimes error = VOP_STRATEGY(bp); 2355455Sdg vp->v_maxra = bp->b_lblkno + bp->b_bcount / size; 2365455Sdg totreads++; 2375455Sdg totreadblocks += bp->b_bcount / size; 2385455Sdg curproc->p_stats->p_ru.ru_inblock++; 2395455Sdg } 2405455Sdg } 2415455Sdg /* 2425455Sdg * and if we have read-aheads, do them too 2435455Sdg */ 2445455Sdg if (rbp) { 2456621Sdg vp->v_maxra = rbp->b_lblkno + rbp->b_bcount / size; 2465455Sdg if (error || (rbp->b_flags & B_CACHE)) { 2471541Srgrimes rbp->b_flags &= ~(B_ASYNC | B_READ); 2481541Srgrimes brelse(rbp); 2495455Sdg } else { 2505455Sdg vfs_busy_pages(rbp, 0); 2511541Srgrimes (void) VOP_STRATEGY(rbp); 2525455Sdg totreads++; 2535455Sdg totreadblocks += rbp->b_bcount / size; 2545455Sdg curproc->p_stats->p_ru.ru_inblock++; 2555455Sdg } 2565455Sdg } 2575839Sdg if (bp && ((bp->b_flags & B_ASYNC) == 0)) 2585455Sdg return (biowait(bp)); 2595455Sdg return (error); 2601541Srgrimes} 2611541Srgrimes 
2621541Srgrimes/* 2631541Srgrimes * If blocks are contiguous on disk, use this to provide clustered 2641541Srgrimes * read ahead. We will read as many blocks as possible sequentially 2651541Srgrimes * and then parcel them up into logical blocks in the buffer hash table. 2661541Srgrimes */ 2671541Srgrimesstruct buf * 2681541Srgrimescluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags) 2691541Srgrimes struct vnode *vp; 2701541Srgrimes u_quad_t filesize; 2711541Srgrimes struct buf *bp; 2721541Srgrimes daddr_t lbn; 2731541Srgrimes daddr_t blkno; 2741541Srgrimes long size; 2751541Srgrimes int run; 2761541Srgrimes long flags; 2771541Srgrimes{ 2781541Srgrimes struct cluster_save *b_save; 2791541Srgrimes struct buf *tbp; 2801541Srgrimes daddr_t bn; 2815455Sdg int i, inc, j; 2821541Srgrimes 2831541Srgrimes#ifdef DIAGNOSTIC 2841541Srgrimes if (size != vp->v_mount->mnt_stat.f_iosize) 2851541Srgrimes panic("cluster_rbuild: size %d != filesize %d\n", 2865455Sdg size, vp->v_mount->mnt_stat.f_iosize); 2871541Srgrimes#endif 2881541Srgrimes if (size * (lbn + run + 1) > filesize) 2891541Srgrimes --run; 2901541Srgrimes if (run == 0) { 2911541Srgrimes if (!bp) { 2921541Srgrimes bp = getblk(vp, lbn, size, 0, 0); 2931541Srgrimes bp->b_blkno = blkno; 2941541Srgrimes bp->b_flags |= flags; 2951541Srgrimes } 2965455Sdg return (bp); 2971541Srgrimes } 2985455Sdg tbp = bp; 2995455Sdg if (!tbp) { 3005455Sdg tbp = getblk(vp, lbn, size, 0, 0); 3015455Sdg } 3025455Sdg if (tbp->b_flags & B_CACHE) { 3035455Sdg return (tbp); 3045455Sdg } else if (bp == NULL) { 3055455Sdg tbp->b_flags |= B_ASYNC; 3065455Sdg } 3075455Sdg bp = getpbuf(); 3085455Sdg bp->b_flags = flags | B_CALL | B_BUSY | B_CLUSTER; 3095455Sdg bp->b_iodone = cluster_callback; 3105455Sdg bp->b_blkno = blkno; 3115455Sdg bp->b_lblkno = lbn; 3125455Sdg pbgetvp(vp, bp); 3131541Srgrimes 3145142Sdg b_save = malloc(sizeof(struct buf *) * (run + 1) + sizeof(struct cluster_save), 3151541Srgrimes M_SEGMENT, M_WAITOK); 3161541Srgrimes 
b_save->bs_nchildren = 0; 3175455Sdg b_save->bs_children = (struct buf **) (b_save + 1); 3185455Sdg bp->b_saveaddr = b_save; 3191541Srgrimes 3205455Sdg bp->b_bcount = 0; 3215455Sdg bp->b_bufsize = 0; 3225455Sdg bp->b_npages = 0; 3235455Sdg 3245455Sdg if (tbp->b_flags & B_VMIO) 3255455Sdg bp->b_flags |= B_VMIO; 3265455Sdg 3271541Srgrimes inc = btodb(size); 3285455Sdg for (bn = blkno, i = 0; i <= run; ++i, bn += inc) { 3295455Sdg if (i != 0) { 3305455Sdg tbp = getblk(vp, lbn + i, size, 0, 0); 3315455Sdg if ((tbp->b_flags & B_CACHE) || 3325455Sdg (tbp->b_flags & B_VMIO) != (bp->b_flags & B_VMIO)) { 3335455Sdg brelse(tbp); 3345455Sdg break; 3355455Sdg } 3365455Sdg tbp->b_blkno = bn; 3375455Sdg tbp->b_flags |= flags | B_READ | B_ASYNC; 3385455Sdg } else { 3395455Sdg tbp->b_flags |= flags | B_READ; 3401541Srgrimes } 3411541Srgrimes ++b_save->bs_nchildren; 3425455Sdg b_save->bs_children[i] = tbp; 3435455Sdg for (j = 0; j < tbp->b_npages; j += 1) { 3445455Sdg bp->b_pages[j + bp->b_npages] = tbp->b_pages[j]; 3451541Srgrimes } 3465455Sdg bp->b_npages += tbp->b_npages; 3475455Sdg bp->b_bcount += size; 3485455Sdg bp->b_bufsize += size; 3491541Srgrimes } 3506621Sdg pmap_qenter((vm_offset_t) bp->b_data, (vm_page_t *)bp->b_pages, bp->b_npages); 3515455Sdg return (bp); 3521541Srgrimes} 3531541Srgrimes 3541541Srgrimes/* 3551541Srgrimes * Cleanup after a clustered read or write. 3561541Srgrimes * This is complicated by the fact that any of the buffers might have 3571541Srgrimes * extra memory (if there were no empty buffer headers at allocbuf time) 3581541Srgrimes * that we will need to shift around. 3591541Srgrimes */ 3601541Srgrimesvoid 3611541Srgrimescluster_callback(bp) 3621541Srgrimes struct buf *bp; 3631541Srgrimes{ 3641541Srgrimes struct cluster_save *b_save; 3651541Srgrimes struct buf **bpp, *tbp; 3661541Srgrimes caddr_t cp; 3671541Srgrimes int error = 0; 3681541Srgrimes 3691541Srgrimes /* 3701541Srgrimes * Must propogate errors to all the components. 
3711541Srgrimes */ 3721541Srgrimes if (bp->b_flags & B_ERROR) 3731541Srgrimes error = bp->b_error; 3741541Srgrimes 3755455Sdg b_save = (struct cluster_save *) (bp->b_saveaddr); 3766621Sdg pmap_qremove((vm_offset_t) bp->b_data, bp->b_npages); 3771541Srgrimes /* 3781541Srgrimes * Move memory from the large cluster buffer into the component 3791541Srgrimes * buffers and mark IO as done on these. 3801541Srgrimes */ 3811541Srgrimes for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) { 3821541Srgrimes tbp = *bpp; 3831541Srgrimes if (error) { 3841541Srgrimes tbp->b_flags |= B_ERROR; 3851541Srgrimes tbp->b_error = error; 3861541Srgrimes } 3871541Srgrimes biodone(tbp); 3881541Srgrimes } 3891541Srgrimes free(b_save, M_SEGMENT); 3905455Sdg relpbuf(bp); 3911541Srgrimes} 3921541Srgrimes 3931541Srgrimes/* 3941541Srgrimes * Do clustered write for FFS. 3951541Srgrimes * 3961541Srgrimes * Three cases: 3971541Srgrimes * 1. Write is not sequential (write asynchronously) 3981541Srgrimes * Write is sequential: 3991541Srgrimes * 2. beginning of cluster - begin cluster 4001541Srgrimes * 3. middle of a cluster - add to cluster 4011541Srgrimes * 4. end of a cluster - asynchronously write cluster 4021541Srgrimes */ 4031541Srgrimesvoid 4041541Srgrimescluster_write(bp, filesize) 4055455Sdg struct buf *bp; 4061541Srgrimes u_quad_t filesize; 4071541Srgrimes{ 4085455Sdg struct vnode *vp; 4095455Sdg daddr_t lbn; 4105455Sdg int maxclen, cursize; 4115455Sdg int lblocksize; 4121541Srgrimes 4135455Sdg vp = bp->b_vp; 4145455Sdg lblocksize = vp->v_mount->mnt_stat.f_iosize; 4155455Sdg lbn = bp->b_lblkno; 4161541Srgrimes 4171541Srgrimes /* Initialize vnode to beginning of file. 
*/ 4181541Srgrimes if (lbn == 0) 4191541Srgrimes vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; 4201541Srgrimes 4215455Sdg if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || 4225455Sdg (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { 4235839Sdg maxclen = MAXPHYS / lblocksize - 1; 4241541Srgrimes if (vp->v_clen != 0) { 4251541Srgrimes /* 4261541Srgrimes * Next block is not sequential. 4275455Sdg * 4281541Srgrimes * If we are not writing at end of file, the process 4295455Sdg * seeked to another point in the file since its last 4305455Sdg * write, or we have reached our maximum cluster size, 4315455Sdg * then push the previous cluster. Otherwise try 4325455Sdg * reallocating to make it sequential. 4331541Srgrimes */ 4341541Srgrimes cursize = vp->v_lastw - vp->v_cstart + 1; 4355455Sdg cluster_wbuild(vp, NULL, lblocksize, 4365455Sdg vp->v_cstart, cursize, lbn); 4371541Srgrimes } 4381541Srgrimes /* 4395455Sdg * Consider beginning a cluster. If at end of file, make 4405455Sdg * cluster as large as possible, otherwise find size of 4415455Sdg * existing cluster. 4421541Srgrimes */ 4435455Sdg if ((lbn + 1) * lblocksize != filesize && 4441541Srgrimes (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) || 4455455Sdg bp->b_blkno == -1)) { 4461541Srgrimes bawrite(bp); 4471541Srgrimes vp->v_clen = 0; 4481541Srgrimes vp->v_lasta = bp->b_blkno; 4491541Srgrimes vp->v_cstart = lbn + 1; 4501541Srgrimes vp->v_lastw = lbn; 4511541Srgrimes return; 4521541Srgrimes } 4535455Sdg vp->v_clen = maxclen; 4545455Sdg if (maxclen == 0) { /* I/O not contiguous */ 4551541Srgrimes vp->v_cstart = lbn + 1; 4565455Sdg bawrite(bp); 4575455Sdg } else { /* Wait for rest of cluster */ 4581541Srgrimes vp->v_cstart = lbn; 4595455Sdg bdwrite(bp); 4601541Srgrimes } 4611541Srgrimes } else if (lbn == vp->v_cstart + vp->v_clen) { 4621541Srgrimes /* 4631541Srgrimes * At end of cluster, write it out. 
4641541Srgrimes */ 4651541Srgrimes cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart, 4661541Srgrimes vp->v_clen + 1, lbn); 4671541Srgrimes vp->v_clen = 0; 4681541Srgrimes vp->v_cstart = lbn + 1; 4691541Srgrimes } else 4701541Srgrimes /* 4715455Sdg * In the middle of a cluster, so just delay the I/O for now. 4721541Srgrimes */ 4731541Srgrimes bdwrite(bp); 4741541Srgrimes vp->v_lastw = lbn; 4751541Srgrimes vp->v_lasta = bp->b_blkno; 4761541Srgrimes} 4771541Srgrimes 4781541Srgrimes 4791541Srgrimes/* 4801541Srgrimes * This is an awful lot like cluster_rbuild...wish they could be combined. 4811541Srgrimes * The last lbn argument is the current block on which I/O is being 4821541Srgrimes * performed. Check to see that it doesn't fall in the middle of 4831541Srgrimes * the current block (if last_bp == NULL). 4841541Srgrimes */ 4851541Srgrimesvoid 4861541Srgrimescluster_wbuild(vp, last_bp, size, start_lbn, len, lbn) 4871541Srgrimes struct vnode *vp; 4881541Srgrimes struct buf *last_bp; 4891541Srgrimes long size; 4901541Srgrimes daddr_t start_lbn; 4911541Srgrimes int len; 4925455Sdg daddr_t lbn; 4931541Srgrimes{ 4941541Srgrimes struct cluster_save *b_save; 4955839Sdg struct buf *bp, *tbp, *pb; 4965455Sdg caddr_t cp; 4975455Sdg int i, j, s; 4981541Srgrimes 4991541Srgrimes#ifdef DIAGNOSTIC 5001541Srgrimes if (size != vp->v_mount->mnt_stat.f_iosize) 5011541Srgrimes panic("cluster_wbuild: size %d != filesize %d\n", 5025455Sdg size, vp->v_mount->mnt_stat.f_iosize); 5031541Srgrimes#endif 5041541Srgrimesredo: 5056837Sdg if( (lbn != -1) || (last_bp == 0)) { 5066837Sdg while ((!(tbp = incore(vp, start_lbn)) || (tbp->b_flags & B_BUSY) 5076837Sdg || (start_lbn == lbn)) && len) { 5086837Sdg ++start_lbn; 5096837Sdg --len; 5106837Sdg } 5111541Srgrimes 5126837Sdg pb = (struct buf *) trypbuf(); 5136837Sdg /* Get more memory for current buffer */ 5146837Sdg if (len <= 1 || pb == 0) { 5156837Sdg relpbuf(pb); 5166837Sdg if (last_bp) { 5176837Sdg bawrite(last_bp); 5186837Sdg } else if (len) { 
5196837Sdg bp = getblk(vp, start_lbn, size, 0, 0); 5206837Sdg bawrite(bp); 5216837Sdg } 5226837Sdg return; 5231541Srgrimes } 5246837Sdg tbp = getblk(vp, start_lbn, size, 0, 0); 5256837Sdg } else { 5266837Sdg tbp = last_bp; 5276837Sdg if( tbp->b_flags & B_BUSY) { 5286837Sdg printf("vfs_cluster: warning: buffer already busy\n"); 5296837Sdg } 5306837Sdg tbp->b_flags |= B_BUSY; 5316837Sdg last_bp = 0; 5326837Sdg pb = (struct buf *) trypbuf(); 5336837Sdg if( pb == 0) { 5346837Sdg bawrite(tbp); 5356837Sdg return; 5366837Sdg } 5371541Srgrimes } 5386837Sdg 5395455Sdg if (!(tbp->b_flags & B_DELWRI)) { 5405839Sdg relpbuf(pb); 5411541Srgrimes ++start_lbn; 5421541Srgrimes --len; 5435455Sdg brelse(tbp); 5441541Srgrimes goto redo; 5451541Srgrimes } 5461541Srgrimes /* 5475455Sdg * Extra memory in the buffer, punt on this buffer. XXX we could 5485455Sdg * handle this in most cases, but we would have to push the extra 5495455Sdg * memory down to after our max possible cluster size and then 5505455Sdg * potentially pull it back up if the cluster was terminated 5515455Sdg * prematurely--too much hassle. 
5521541Srgrimes */ 5535455Sdg if (tbp->b_bcount != tbp->b_bufsize) { 5545839Sdg relpbuf(pb); 5551541Srgrimes ++start_lbn; 5561541Srgrimes --len; 5575455Sdg bawrite(tbp); 5581541Srgrimes goto redo; 5591541Srgrimes } 5605839Sdg bp = pb; 5615455Sdg b_save = malloc(sizeof(struct buf *) * (len + 1) + sizeof(struct cluster_save), 5621541Srgrimes M_SEGMENT, M_WAITOK); 5631541Srgrimes b_save->bs_nchildren = 0; 5645455Sdg b_save->bs_children = (struct buf **) (b_save + 1); 5655455Sdg bp->b_saveaddr = b_save; 5665455Sdg bp->b_bcount = 0; 5675455Sdg bp->b_bufsize = 0; 5685455Sdg bp->b_npages = 0; 5691541Srgrimes 5705455Sdg if (tbp->b_flags & B_VMIO) 5715455Sdg bp->b_flags |= B_VMIO; 5725455Sdg 5735455Sdg bp->b_blkno = tbp->b_blkno; 5745455Sdg bp->b_lblkno = tbp->b_lblkno; 5755455Sdg bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER; 5761541Srgrimes bp->b_iodone = cluster_callback; 5775455Sdg pbgetvp(vp, bp); 5781541Srgrimes 5795455Sdg for (i = 0; i < len; ++i, ++start_lbn) { 5805455Sdg if (i != 0) { 5815455Sdg /* 5825455Sdg * Block is not in core or the non-sequential block 5835455Sdg * ending our cluster was part of the cluster (in 5845455Sdg * which case we don't want to write it twice). 5855455Sdg */ 5865455Sdg if (!(tbp = incore(vp, start_lbn)) || 5875455Sdg (last_bp == NULL && start_lbn == lbn)) 5881541Srgrimes break; 5891541Srgrimes 5905839Sdg if ((tbp->b_flags & (B_INVAL | B_CLUSTEROK)) != B_CLUSTEROK) 5915455Sdg break; 5921541Srgrimes 5936837Sdg if ((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE)) 5946837Sdg break; 5956837Sdg 5965455Sdg /* 5975455Sdg * Get the desired block buffer (unless it is the 5985455Sdg * final sequential block whose buffer was passed in 5995455Sdg * explictly as last_bp). 
6005455Sdg */ 6015455Sdg if (last_bp == NULL || start_lbn != lbn) { 6025839Sdg if( tbp->b_flags & B_BUSY) 6035839Sdg break; 6045455Sdg tbp = getblk(vp, start_lbn, size, 0, 0); 6055455Sdg if (!(tbp->b_flags & B_DELWRI) || 6065455Sdg ((tbp->b_flags & B_VMIO) != (bp->b_flags & B_VMIO))) { 6075455Sdg brelse(tbp); 6085455Sdg break; 6095455Sdg } 6105455Sdg } else 6115455Sdg tbp = last_bp; 6121541Srgrimes } 6135455Sdg for (j = 0; j < tbp->b_npages; j += 1) { 6145455Sdg bp->b_pages[j + bp->b_npages] = tbp->b_pages[j]; 6155455Sdg } 6165455Sdg bp->b_npages += tbp->b_npages; 6171541Srgrimes bp->b_bcount += size; 6181541Srgrimes bp->b_bufsize += size; 6191541Srgrimes 6201541Srgrimes tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); 6211937Sdg tbp->b_flags |= B_ASYNC; 6221541Srgrimes s = splbio(); 6235455Sdg reassignbuf(tbp, tbp->b_vp); /* put on clean list */ 6241541Srgrimes ++tbp->b_vp->v_numoutput; 6251541Srgrimes splx(s); 6261541Srgrimes b_save->bs_children[i] = tbp; 6271541Srgrimes } 6285455Sdg b_save->bs_nchildren = i; 6296621Sdg pmap_qenter((vm_offset_t) bp->b_data, (vm_page_t *) bp->b_pages, bp->b_npages); 6305455Sdg bawrite(bp); 6311541Srgrimes 6321541Srgrimes if (i < len) { 6335455Sdg len -= i; 6341541Srgrimes goto redo; 6351541Srgrimes } 6361541Srgrimes} 6371541Srgrimes 6381541Srgrimes/* 6391541Srgrimes * Collect together all the buffers in a cluster. 6401541Srgrimes * Plus add one additional buffer. 
6411541Srgrimes */ 6421541Srgrimesstruct cluster_save * 6431541Srgrimescluster_collectbufs(vp, last_bp) 6441541Srgrimes struct vnode *vp; 6451541Srgrimes struct buf *last_bp; 6461541Srgrimes{ 6471541Srgrimes struct cluster_save *buflist; 6485455Sdg daddr_t lbn; 6491541Srgrimes int i, len; 6501541Srgrimes 6511541Srgrimes len = vp->v_lastw - vp->v_cstart + 1; 6521541Srgrimes buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), 6531541Srgrimes M_SEGMENT, M_WAITOK); 6541541Srgrimes buflist->bs_nchildren = 0; 6555455Sdg buflist->bs_children = (struct buf **) (buflist + 1); 6561541Srgrimes for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) 6575455Sdg (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, 6585455Sdg &buflist->bs_children[i]); 6591541Srgrimes buflist->bs_children[i] = last_bp; 6601541Srgrimes buflist->bs_nchildren = i + 1; 6611541Srgrimes return (buflist); 6621541Srgrimes} 663