vfs_cluster.c revision 10541
11541Srgrimes/*- 21541Srgrimes * Copyright (c) 1993 31541Srgrimes * The Regents of the University of California. All rights reserved. 45455Sdg * Modifications/enhancements: 55455Sdg * Copyright (c) 1995 John S. Dyson. All rights reserved. 61541Srgrimes * 71541Srgrimes * Redistribution and use in source and binary forms, with or without 81541Srgrimes * modification, are permitted provided that the following conditions 91541Srgrimes * are met: 101541Srgrimes * 1. Redistributions of source code must retain the above copyright 111541Srgrimes * notice, this list of conditions and the following disclaimer. 121541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 131541Srgrimes * notice, this list of conditions and the following disclaimer in the 141541Srgrimes * documentation and/or other materials provided with the distribution. 151541Srgrimes * 3. All advertising materials mentioning features or use of this software 161541Srgrimes * must display the following acknowledgement: 171541Srgrimes * This product includes software developed by the University of 181541Srgrimes * California, Berkeley and its contributors. 191541Srgrimes * 4. Neither the name of the University nor the names of its contributors 201541Srgrimes * may be used to endorse or promote products derived from this software 211541Srgrimes * without specific prior written permission. 221541Srgrimes * 231541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 241541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 251541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 261541Srgrimes * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 271541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 281541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 291541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 301541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 311541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 321541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 331541Srgrimes * SUCH DAMAGE. 341541Srgrimes * 351541Srgrimes * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 3610541Sdyson * $Id: vfs_cluster.c,v 1.17 1995/06/28 12:31:47 davidg Exp $ 371541Srgrimes */ 381541Srgrimes 391541Srgrimes#include <sys/param.h> 401549Srgrimes#include <sys/systm.h> 411541Srgrimes#include <sys/proc.h> 421541Srgrimes#include <sys/buf.h> 431541Srgrimes#include <sys/vnode.h> 441541Srgrimes#include <sys/mount.h> 451541Srgrimes#include <sys/malloc.h> 461541Srgrimes#include <sys/resourcevar.h> 475455Sdg#include <sys/vmmeter.h> 485455Sdg#include <miscfs/specfs/specdev.h> 496621Sdg#include <vm/vm.h> 5010541Sdyson#include <vm/vm_object.h> 5110541Sdyson#include <vm/vm_page.h> 521541Srgrimes 531541Srgrimes#ifdef DEBUG 541541Srgrimes#include <vm/vm.h> 551541Srgrimes#include <sys/sysctl.h> 563055Sdgint doreallocblks = 0; 575455Sdgstruct ctldebug debug13 = {"doreallocblks", &doreallocblks}; 585455Sdg 591541Srgrimes#else 601541Srgrimes/* XXX for cluster_write */ 613055Sdg#define doreallocblks 0 621541Srgrimes#endif 631541Srgrimes 641541Srgrimes/* 651541Srgrimes * Local declarations 661541Srgrimes */ 6710541Sdysonstatic struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, 6810541Sdyson daddr_t, daddr_t, long, int)); 691541Srgrimesstruct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *)); 701541Srgrimes 715455Sdgint totreads; 725455Sdgint totreadblocks; 
7310541Sdysonextern vm_page_t bogus_page; 745455Sdg 751541Srgrimes#ifdef DIAGNOSTIC 761541Srgrimes/* 771541Srgrimes * Set to 1 if reads of block zero should cause readahead to be done. 781541Srgrimes * Set to 0 treats a read of block zero as a non-sequential read. 791541Srgrimes * 801541Srgrimes * Setting to one assumes that most reads of block zero of files are due to 811541Srgrimes * sequential passes over the files (e.g. cat, sum) where additional blocks 821541Srgrimes * will soon be needed. Setting to zero assumes that the majority are 831541Srgrimes * surgical strikes to get particular info (e.g. size, file) where readahead 841541Srgrimes * blocks will not be used and, in fact, push out other potentially useful 851541Srgrimes * blocks from the cache. The former seems intuitive, but some quick tests 861541Srgrimes * showed that the latter performed better from a system-wide point of view. 871541Srgrimes */ 885455Sdg int doclusterraz = 0; 895455Sdg 901541Srgrimes#define ISSEQREAD(vp, blk) \ 911541Srgrimes (((blk) != 0 || doclusterraz) && \ 921541Srgrimes ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr)) 931541Srgrimes#else 941541Srgrimes#define ISSEQREAD(vp, blk) \ 955839Sdg (/* (blk) != 0 && */ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr)) 961541Srgrimes#endif 971541Srgrimes 981541Srgrimes/* 9910541Sdyson * allow for three entire read-aheads... The system will 10010541Sdyson * adjust downwards rapidly if needed... 10110541Sdyson */ 10210541Sdyson#define RA_MULTIPLE_FAST 2 10310541Sdyson#define RA_MULTIPLE_SLOW 3 10410541Sdyson#define RA_SHIFTDOWN 1 /* approx lg2(RA_MULTIPLE) */ 10510541Sdyson/* 1061541Srgrimes * This replaces bread. If this is a bread at the beginning of a file and 1071541Srgrimes * lastr is 0, we assume this is the first read and we'll read up to two 1081541Srgrimes * blocks if they are sequential. After that, we'll do regular read ahead 1091541Srgrimes * in clustered chunks. 1101541Srgrimes * bp is the block requested. 
1111541Srgrimes * rbp is the read-ahead block. 1121541Srgrimes * If either is NULL, then you don't have to do the I/O. 1131541Srgrimes */ 1141549Srgrimesint 1151541Srgrimescluster_read(vp, filesize, lblkno, size, cred, bpp) 1161541Srgrimes struct vnode *vp; 1171541Srgrimes u_quad_t filesize; 1181541Srgrimes daddr_t lblkno; 1191541Srgrimes long size; 1201541Srgrimes struct ucred *cred; 1211541Srgrimes struct buf **bpp; 1221541Srgrimes{ 1231541Srgrimes struct buf *bp, *rbp; 1245455Sdg daddr_t blkno, rablkno, origlblkno; 1251541Srgrimes long flags; 1261541Srgrimes int error, num_ra, alreadyincore; 12710541Sdyson int i; 12810541Sdyson int seq; 1291541Srgrimes 1301541Srgrimes error = 0; 1315455Sdg /* 1325455Sdg * get the requested block 1335455Sdg */ 13410541Sdyson origlblkno = lblkno; 1351541Srgrimes *bpp = bp = getblk(vp, lblkno, size, 0, 0); 13610541Sdyson seq = ISSEQREAD(vp, lblkno); 1375455Sdg /* 1385455Sdg * if it is in the cache, then check to see if the reads have been 1395455Sdg * sequential. If they have, then try some read-ahead, otherwise 1405455Sdg * back-off on prospective read-aheads. 1415455Sdg */ 1421541Srgrimes if (bp->b_flags & B_CACHE) { 14310541Sdyson if (!seq) { 1446621Sdg vp->v_maxra = bp->b_lblkno + bp->b_bcount / size; 14510541Sdyson vp->v_ralen >>= RA_SHIFTDOWN; 1465455Sdg return 0; 14710541Sdyson } else if( vp->v_maxra > lblkno) { 14810541Sdyson if ( (vp->v_maxra + (vp->v_ralen / RA_MULTIPLE_SLOW)) >= (lblkno + vp->v_ralen)) { 14910541Sdyson if ((vp->v_ralen + 1) < RA_MULTIPLE_FAST*(MAXPHYS / size)) 15010541Sdyson ++vp->v_ralen; 1515839Sdg return 0; 15210541Sdyson } 1536621Sdg lblkno = vp->v_maxra; 15410541Sdyson } else { 15510541Sdyson lblkno += 1; 1568876Srgrimes } 1575455Sdg bp = NULL; 1585455Sdg } else { 1591541Srgrimes /* 1605455Sdg * if it isn't in the cache, then get a chunk from disk if 1615455Sdg * sequential, otherwise just get the block. 
1621541Srgrimes */ 1631541Srgrimes bp->b_flags |= B_READ; 1645455Sdg lblkno += 1; 1655455Sdg curproc->p_stats->p_ru.ru_inblock++; /* XXX */ 16610541Sdyson vp->v_ralen = 0; 1671541Srgrimes } 1681541Srgrimes /* 1695455Sdg * assume no read-ahead 1705455Sdg */ 1715455Sdg alreadyincore = 1; 1725455Sdg rablkno = lblkno; 1735455Sdg 1745455Sdg /* 1755455Sdg * if we have been doing sequential I/O, then do some read-ahead 1765455Sdg */ 17710541Sdyson if (seq) { 1785455Sdg 17910541Sdyson /* 18010541Sdyson * bump ralen a bit... 18110541Sdyson */ 18210541Sdyson if ((vp->v_ralen + 1) < RA_MULTIPLE_SLOW*(MAXPHYS / size)) 18310541Sdyson ++vp->v_ralen; 1841541Srgrimes /* 1855455Sdg * this code makes sure that the stuff that we have read-ahead 1865455Sdg * is still in the cache. If it isn't, we have been reading 1875455Sdg * ahead too much, and we need to back-off, otherwise we might 1885455Sdg * try to read more. 1891541Srgrimes */ 1905455Sdg for (i = 0; i < vp->v_ralen; i++) { 1915455Sdg rablkno = lblkno + i; 1925455Sdg alreadyincore = (int) incore(vp, rablkno); 1935455Sdg if (!alreadyincore) { 19410541Sdyson if (inmem(vp, rablkno)) { 19510541Sdyson struct buf *bpt; 19610541Sdyson if (vp->v_maxra < rablkno) 19710541Sdyson vp->v_maxra = rablkno + 1; 19810541Sdyson continue; 19910541Sdyson } 2005455Sdg if (rablkno < vp->v_maxra) { 2015455Sdg vp->v_maxra = rablkno; 20210541Sdyson vp->v_ralen >>= RA_SHIFTDOWN; 2035455Sdg alreadyincore = 1; 2045455Sdg } 2055455Sdg break; 20610541Sdyson } else if (vp->v_maxra < rablkno) { 2076621Sdg vp->v_maxra = rablkno + 1; 2085455Sdg } 2091541Srgrimes } 2105455Sdg } 2115455Sdg /* 2125455Sdg * we now build the read-ahead buffer if it is desirable. 
2135455Sdg */ 2145455Sdg rbp = NULL; 2155455Sdg if (!alreadyincore && 2165455Sdg (rablkno + 1) * size <= filesize && 21710541Sdyson !(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra, NULL)) && 2185455Sdg blkno != -1) { 2195455Sdg if (num_ra > vp->v_ralen) 2205455Sdg num_ra = vp->v_ralen; 2211541Srgrimes 2226621Sdg if (num_ra) { 22310541Sdyson rbp = cluster_rbuild(vp, filesize, rablkno, blkno, size, 22410541Sdyson num_ra + 1); 2251541Srgrimes } else { 2265455Sdg rbp = getblk(vp, rablkno, size, 0, 0); 2275455Sdg rbp->b_flags |= B_READ | B_ASYNC; 2281541Srgrimes rbp->b_blkno = blkno; 2291541Srgrimes } 2301541Srgrimes } 2315839Sdg 2325455Sdg /* 23310541Sdyson * handle the synchronous read 2345455Sdg */ 2355455Sdg if (bp) { 2361541Srgrimes if (bp->b_flags & (B_DONE | B_DELWRI)) 2371541Srgrimes panic("cluster_read: DONE bp"); 2385455Sdg else { 2395455Sdg vfs_busy_pages(bp, 0); 2401541Srgrimes error = VOP_STRATEGY(bp); 2415455Sdg vp->v_maxra = bp->b_lblkno + bp->b_bcount / size; 2425455Sdg totreads++; 2435455Sdg totreadblocks += bp->b_bcount / size; 2445455Sdg curproc->p_stats->p_ru.ru_inblock++; 2455455Sdg } 2465455Sdg } 2475455Sdg /* 2485455Sdg * and if we have read-aheads, do them too 2495455Sdg */ 2505455Sdg if (rbp) { 2516621Sdg vp->v_maxra = rbp->b_lblkno + rbp->b_bcount / size; 2525455Sdg if (error || (rbp->b_flags & B_CACHE)) { 2531541Srgrimes rbp->b_flags &= ~(B_ASYNC | B_READ); 2541541Srgrimes brelse(rbp); 2555455Sdg } else { 25610541Sdyson if ((rbp->b_flags & B_CLUSTER) == 0) 25710541Sdyson vfs_busy_pages(rbp, 0); 2581541Srgrimes (void) VOP_STRATEGY(rbp); 2595455Sdg totreads++; 2605455Sdg totreadblocks += rbp->b_bcount / size; 2615455Sdg curproc->p_stats->p_ru.ru_inblock++; 2625455Sdg } 2635455Sdg } 2645839Sdg if (bp && ((bp->b_flags & B_ASYNC) == 0)) 2655455Sdg return (biowait(bp)); 2665455Sdg return (error); 2671541Srgrimes} 2681541Srgrimes 2691541Srgrimes/* 2701541Srgrimes * If blocks are contiguous on disk, use this to provide clustered 2711541Srgrimes 
/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 *
 *	vp	 - vnode being read
 *	filesize - EOF; the run is clipped so it never crosses it
 *	lbn	 - first logical block of the cluster
 *	blkno	 - corresponding physical (device) block
 *	size	 - filesystem block size
 *	run	 - number of blocks to attempt
 *
 * Returns either the plain first-block buffer (when clustering is not
 * possible) or a composite cluster pbuf whose completion is handled by
 * cluster_callback().
 */
static struct buf *
cluster_rbuild(vp, filesize, lbn, blkno, size, run)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	daddr_t bn;
	int i, inc, j;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
	/* clip the run so the last block does not extend past EOF */
	if (size * (lbn + run + 1) > filesize)
		--run;

	tbp = getblk(vp, lbn, size, 0, 0);
	/* already valid in the cache: no I/O needed at all */
	if (tbp->b_flags & B_CACHE)
		return tbp;

	tbp->b_blkno = blkno;
	tbp->b_flags |= B_ASYNC | B_READ;
	/* clustering needs VMIO pages and more than one block */
	if( ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) )
		return tbp;

	/* no spare pbuf available: fall back to a single-block read */
	bp = trypbuf();
	if (bp == 0)
		return tbp;

	/*
	 * Carry the first buffer's page offset into the pbuf's data
	 * pointer (cast-as-lvalue is a compiler extension).
	 */
	(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
	bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
	bp->b_iodone = cluster_callback;
	bp->b_blkno = blkno;
	bp->b_lblkno = lbn;
	pbgetvp(vp, bp);

	/* child-buffer table, freed by cluster_callback() */
	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **) (b_save + 1);
	bp->b_saveaddr = b_save;

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	inc = btodb(size);
	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
		if (i != 0) {
			/* stop before the composite exceeds MAXPHYS */
			if ((bp->b_npages * PAGE_SIZE) + size > MAXPHYS)
				break;
			/* an existing buffer ends the cluster */
			if (incore(vp, lbn + i))
				break;
			tbp = getblk(vp, lbn + i, size, 0, 0);

			if ((tbp->b_flags & B_CACHE) ||
			    (tbp->b_flags & B_VMIO) == 0) {
				brelse(tbp);
				break;
			}

			/* any partially-valid page ends the cluster */
			for (j=0;j<tbp->b_npages;j++) {
				if (tbp->b_pages[j]->valid) {
					break;
				}
			}

			if (j != tbp->b_npages) {
				brelse(tbp);
				break;
			}

			/*
			 * Adopt the expected physical block if unmapped;
			 * a non-contiguous mapping ends the cluster.
			 */
			tbp->b_flags |= B_READ | B_ASYNC;
			if( tbp->b_blkno == tbp->b_lblkno) {
				tbp->b_blkno = bn;
			} else if (tbp->b_blkno != bn) {
				brelse(tbp);
				break;
			}
		}
		++b_save->bs_nchildren;
		b_save->bs_children[i] = tbp;
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;
			m = tbp->b_pages[j];
			/* hold the page busy for the duration of the I/O */
			++m->busy;
			++m->object->paging_in_progress;
			/*
			 * Fully-valid pages must not be overwritten by the
			 * transfer: substitute bogus_page in the map.
			 */
			if (m->valid == VM_PAGE_BITS_ALL) {
				m = bogus_page;
			}
			/* buffers may share a straddling page; add it once */
			if ((bp->b_npages == 0) ||
			    (bp->b_pages[bp->b_npages - 1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
		}
		bp->b_bcount += tbp->b_bcount;
		bp->b_bufsize += tbp->b_bufsize;
	}
	/* map the gathered pages under the pbuf's kernel address */
	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
	    (vm_page_t *)bp->b_pages, bp->b_npages);
	return (bp);
}
3831541Srgrimes 3841541Srgrimes/* 3851541Srgrimes * Cleanup after a clustered read or write. 3861541Srgrimes * This is complicated by the fact that any of the buffers might have 3871541Srgrimes * extra memory (if there were no empty buffer headers at allocbuf time) 3881541Srgrimes * that we will need to shift around. 3891541Srgrimes */ 3901541Srgrimesvoid 3911541Srgrimescluster_callback(bp) 3921541Srgrimes struct buf *bp; 3931541Srgrimes{ 3941541Srgrimes struct cluster_save *b_save; 3951541Srgrimes struct buf **bpp, *tbp; 3961541Srgrimes caddr_t cp; 3971541Srgrimes int error = 0; 3981541Srgrimes 3991541Srgrimes /* 4001541Srgrimes * Must propogate errors to all the components. 4011541Srgrimes */ 4021541Srgrimes if (bp->b_flags & B_ERROR) 4031541Srgrimes error = bp->b_error; 4041541Srgrimes 4055455Sdg b_save = (struct cluster_save *) (bp->b_saveaddr); 40610541Sdyson pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); 4071541Srgrimes /* 4081541Srgrimes * Move memory from the large cluster buffer into the component 4091541Srgrimes * buffers and mark IO as done on these. 4101541Srgrimes */ 4111541Srgrimes for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) { 4121541Srgrimes tbp = *bpp; 4131541Srgrimes if (error) { 4141541Srgrimes tbp->b_flags |= B_ERROR; 4151541Srgrimes tbp->b_error = error; 4161541Srgrimes } 4171541Srgrimes biodone(tbp); 4181541Srgrimes } 4191541Srgrimes free(b_save, M_SEGMENT); 4205455Sdg relpbuf(bp); 4211541Srgrimes} 4221541Srgrimes 4231541Srgrimes/* 4241541Srgrimes * Do clustered write for FFS. 4251541Srgrimes * 4261541Srgrimes * Three cases: 4271541Srgrimes * 1. Write is not sequential (write asynchronously) 4281541Srgrimes * Write is sequential: 4291541Srgrimes * 2. beginning of cluster - begin cluster 4301541Srgrimes * 3. middle of a cluster - add to cluster 4311541Srgrimes * 4. 
end of a cluster - asynchronously write cluster 4321541Srgrimes */ 4331541Srgrimesvoid 4341541Srgrimescluster_write(bp, filesize) 4355455Sdg struct buf *bp; 4361541Srgrimes u_quad_t filesize; 4371541Srgrimes{ 4385455Sdg struct vnode *vp; 4395455Sdg daddr_t lbn; 4405455Sdg int maxclen, cursize; 4415455Sdg int lblocksize; 4421541Srgrimes 4435455Sdg vp = bp->b_vp; 4445455Sdg lblocksize = vp->v_mount->mnt_stat.f_iosize; 4455455Sdg lbn = bp->b_lblkno; 4461541Srgrimes 4471541Srgrimes /* Initialize vnode to beginning of file. */ 4481541Srgrimes if (lbn == 0) 4491541Srgrimes vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; 4501541Srgrimes 4515455Sdg if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || 4525455Sdg (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { 4535839Sdg maxclen = MAXPHYS / lblocksize - 1; 4541541Srgrimes if (vp->v_clen != 0) { 4551541Srgrimes /* 4561541Srgrimes * Next block is not sequential. 4578876Srgrimes * 4581541Srgrimes * If we are not writing at end of file, the process 4595455Sdg * seeked to another point in the file since its last 4605455Sdg * write, or we have reached our maximum cluster size, 4615455Sdg * then push the previous cluster. Otherwise try 4625455Sdg * reallocating to make it sequential. 4631541Srgrimes */ 4641541Srgrimes cursize = vp->v_lastw - vp->v_cstart + 1; 46510541Sdyson if (!doreallocblks || 46610541Sdyson (lbn + 1) * lblocksize != filesize || 46710541Sdyson lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { 46810541Sdyson cluster_wbuild(vp, NULL, lblocksize, 46910541Sdyson vp->v_cstart, cursize, lbn); 47010541Sdyson } else { 47110541Sdyson struct buf **bpp, **endbp; 47210541Sdyson struct cluster_save *buflist; 47310541Sdyson 47410541Sdyson buflist = cluster_collectbufs(vp, bp); 47510541Sdyson endbp = &buflist->bs_children 47610541Sdyson [buflist->bs_nchildren - 1]; 47710541Sdyson if (VOP_REALLOCBLKS(vp, buflist)) { 47810541Sdyson /* 47910541Sdyson * Failed, push the previous cluster. 
48010541Sdyson */ 48110541Sdyson for (bpp = buflist->bs_children; 48210541Sdyson bpp < endbp; bpp++) 48310541Sdyson brelse(*bpp); 48410541Sdyson free(buflist, M_SEGMENT); 48510541Sdyson cluster_wbuild(vp, NULL, lblocksize, 48610541Sdyson vp->v_cstart, cursize, lbn); 48710541Sdyson } else { 48810541Sdyson /* 48910541Sdyson * Succeeded, keep building cluster. 49010541Sdyson */ 49110541Sdyson for (bpp = buflist->bs_children; 49210541Sdyson bpp <= endbp; bpp++) 49310541Sdyson bdwrite(*bpp); 49410541Sdyson free(buflist, M_SEGMENT); 49510541Sdyson vp->v_lastw = lbn; 49610541Sdyson vp->v_lasta = bp->b_blkno; 49710541Sdyson return; 49810541Sdyson } 49910541Sdyson } 5001541Srgrimes } 5011541Srgrimes /* 5025455Sdg * Consider beginning a cluster. If at end of file, make 5035455Sdg * cluster as large as possible, otherwise find size of 5045455Sdg * existing cluster. 5051541Srgrimes */ 5065455Sdg if ((lbn + 1) * lblocksize != filesize && 5077613Sdg (bp->b_blkno == bp->b_lblkno) && 50810541Sdyson (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || 50910541Sdyson bp->b_blkno == -1)) { 5101541Srgrimes bawrite(bp); 5111541Srgrimes vp->v_clen = 0; 5121541Srgrimes vp->v_lasta = bp->b_blkno; 5131541Srgrimes vp->v_cstart = lbn + 1; 5141541Srgrimes vp->v_lastw = lbn; 5151541Srgrimes return; 5161541Srgrimes } 5175455Sdg vp->v_clen = maxclen; 5185455Sdg if (maxclen == 0) { /* I/O not contiguous */ 5191541Srgrimes vp->v_cstart = lbn + 1; 5205455Sdg bawrite(bp); 5215455Sdg } else { /* Wait for rest of cluster */ 5221541Srgrimes vp->v_cstart = lbn; 5235455Sdg bdwrite(bp); 5241541Srgrimes } 5251541Srgrimes } else if (lbn == vp->v_cstart + vp->v_clen) { 5261541Srgrimes /* 5271541Srgrimes * At end of cluster, write it out. 
5281541Srgrimes */ 5291541Srgrimes cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart, 5301541Srgrimes vp->v_clen + 1, lbn); 5311541Srgrimes vp->v_clen = 0; 5321541Srgrimes vp->v_cstart = lbn + 1; 5331541Srgrimes } else 5341541Srgrimes /* 5355455Sdg * In the middle of a cluster, so just delay the I/O for now. 5361541Srgrimes */ 5371541Srgrimes bdwrite(bp); 5381541Srgrimes vp->v_lastw = lbn; 5391541Srgrimes vp->v_lasta = bp->b_blkno; 5401541Srgrimes} 5411541Srgrimes 5421541Srgrimes 5431541Srgrimes/* 5441541Srgrimes * This is an awful lot like cluster_rbuild...wish they could be combined. 5451541Srgrimes * The last lbn argument is the current block on which I/O is being 5461541Srgrimes * performed. Check to see that it doesn't fall in the middle of 5471541Srgrimes * the current block (if last_bp == NULL). 5481541Srgrimes */ 5491541Srgrimesvoid 5501541Srgrimescluster_wbuild(vp, last_bp, size, start_lbn, len, lbn) 5511541Srgrimes struct vnode *vp; 5521541Srgrimes struct buf *last_bp; 5531541Srgrimes long size; 5541541Srgrimes daddr_t start_lbn; 5551541Srgrimes int len; 5565455Sdg daddr_t lbn; 5571541Srgrimes{ 5581541Srgrimes struct cluster_save *b_save; 5595839Sdg struct buf *bp, *tbp, *pb; 5605455Sdg caddr_t cp; 5615455Sdg int i, j, s; 5621541Srgrimes 5631541Srgrimes#ifdef DIAGNOSTIC 5641541Srgrimes if (size != vp->v_mount->mnt_stat.f_iosize) 5651541Srgrimes panic("cluster_wbuild: size %d != filesize %d\n", 5665455Sdg size, vp->v_mount->mnt_stat.f_iosize); 5671541Srgrimes#endif 5681541Srgrimesredo: 5696837Sdg if( (lbn != -1) || (last_bp == 0)) { 5706837Sdg while ((!(tbp = incore(vp, start_lbn)) || (tbp->b_flags & B_BUSY) 5716837Sdg || (start_lbn == lbn)) && len) { 5726837Sdg ++start_lbn; 5736837Sdg --len; 5746837Sdg } 5751541Srgrimes 5767090Sbde pb = trypbuf(); 5776837Sdg /* Get more memory for current buffer */ 5787164Sdg if (len <= 1 || pb == NULL) { 5797164Sdg if (pb != NULL) 5807164Sdg relpbuf(pb); 5816837Sdg if (last_bp) { 5826837Sdg bawrite(last_bp); 5836837Sdg } 
else if (len) { 5846837Sdg bp = getblk(vp, start_lbn, size, 0, 0); 5856837Sdg bawrite(bp); 5866837Sdg } 5876837Sdg return; 5881541Srgrimes } 5896837Sdg tbp = getblk(vp, start_lbn, size, 0, 0); 5906837Sdg } else { 5916837Sdg tbp = last_bp; 5926837Sdg if( tbp->b_flags & B_BUSY) { 5936837Sdg printf("vfs_cluster: warning: buffer already busy\n"); 5946837Sdg } 5956837Sdg tbp->b_flags |= B_BUSY; 5966837Sdg last_bp = 0; 5977090Sbde pb = trypbuf(); 5987164Sdg if (pb == NULL) { 5996837Sdg bawrite(tbp); 6006837Sdg return; 6016837Sdg } 6021541Srgrimes } 6036837Sdg 6045455Sdg if (!(tbp->b_flags & B_DELWRI)) { 6055839Sdg relpbuf(pb); 6061541Srgrimes ++start_lbn; 6071541Srgrimes --len; 6085455Sdg brelse(tbp); 6091541Srgrimes goto redo; 6101541Srgrimes } 6111541Srgrimes /* 6125455Sdg * Extra memory in the buffer, punt on this buffer. XXX we could 6135455Sdg * handle this in most cases, but we would have to push the extra 6145455Sdg * memory down to after our max possible cluster size and then 6155455Sdg * potentially pull it back up if the cluster was terminated 6165455Sdg * prematurely--too much hassle. 
6171541Srgrimes */ 6185455Sdg if (tbp->b_bcount != tbp->b_bufsize) { 6195839Sdg relpbuf(pb); 6201541Srgrimes ++start_lbn; 6211541Srgrimes --len; 6225455Sdg bawrite(tbp); 6231541Srgrimes goto redo; 6241541Srgrimes } 6255839Sdg bp = pb; 6265455Sdg b_save = malloc(sizeof(struct buf *) * (len + 1) + sizeof(struct cluster_save), 6271541Srgrimes M_SEGMENT, M_WAITOK); 6281541Srgrimes b_save->bs_nchildren = 0; 6295455Sdg b_save->bs_children = (struct buf **) (b_save + 1); 6305455Sdg bp->b_saveaddr = b_save; 6315455Sdg bp->b_bcount = 0; 6325455Sdg bp->b_bufsize = 0; 6335455Sdg bp->b_npages = 0; 6341541Srgrimes 6355455Sdg if (tbp->b_flags & B_VMIO) 6365455Sdg bp->b_flags |= B_VMIO; 6375455Sdg 6385455Sdg bp->b_blkno = tbp->b_blkno; 6395455Sdg bp->b_lblkno = tbp->b_lblkno; 64010541Sdyson (vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK; 6415455Sdg bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER; 6421541Srgrimes bp->b_iodone = cluster_callback; 6435455Sdg pbgetvp(vp, bp); 6441541Srgrimes 6455455Sdg for (i = 0; i < len; ++i, ++start_lbn) { 6465455Sdg if (i != 0) { 6475455Sdg /* 6485455Sdg * Block is not in core or the non-sequential block 6495455Sdg * ending our cluster was part of the cluster (in 6505455Sdg * which case we don't want to write it twice). 6515455Sdg */ 6525455Sdg if (!(tbp = incore(vp, start_lbn)) || 6535455Sdg (last_bp == NULL && start_lbn == lbn)) 6541541Srgrimes break; 6551541Srgrimes 6565839Sdg if ((tbp->b_flags & (B_INVAL | B_CLUSTEROK)) != B_CLUSTEROK) 6575455Sdg break; 6581541Srgrimes 6596837Sdg if ((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE)) 6606837Sdg break; 6616837Sdg 66210541Sdyson if ( (tbp->b_blkno != tbp->b_lblkno) && 66310541Sdyson ((bp->b_blkno + btodb(size) * i) != tbp->b_blkno)) 66410541Sdyson break; 66510541Sdyson 6665455Sdg /* 6675455Sdg * Get the desired block buffer (unless it is the 6685455Sdg * final sequential block whose buffer was passed in 6695455Sdg * explictly as last_bp). 
6705455Sdg */ 6715455Sdg if (last_bp == NULL || start_lbn != lbn) { 6725839Sdg if( tbp->b_flags & B_BUSY) 6735839Sdg break; 6745455Sdg tbp = getblk(vp, start_lbn, size, 0, 0); 6755455Sdg if (!(tbp->b_flags & B_DELWRI) || 6765455Sdg ((tbp->b_flags & B_VMIO) != (bp->b_flags & B_VMIO))) { 6775455Sdg brelse(tbp); 6785455Sdg break; 6795455Sdg } 6805455Sdg } else 6815455Sdg tbp = last_bp; 6821541Srgrimes } 6835455Sdg for (j = 0; j < tbp->b_npages; j += 1) { 68410541Sdyson vm_page_t m; 68510541Sdyson m = tbp->b_pages[j]; 68610541Sdyson ++m->busy; 68710541Sdyson ++m->object->paging_in_progress; 68810541Sdyson if ((bp->b_npages == 0) || 68910541Sdyson (bp->b_pages[bp->b_npages - 1] != m)) { 69010541Sdyson bp->b_pages[bp->b_npages] = m; 69110541Sdyson bp->b_npages++; 69210541Sdyson } 6935455Sdg } 6941541Srgrimes bp->b_bcount += size; 6951541Srgrimes bp->b_bufsize += size; 6961541Srgrimes 6971541Srgrimes tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); 6981937Sdg tbp->b_flags |= B_ASYNC; 6991541Srgrimes s = splbio(); 7005455Sdg reassignbuf(tbp, tbp->b_vp); /* put on clean list */ 7011541Srgrimes ++tbp->b_vp->v_numoutput; 7021541Srgrimes splx(s); 7031541Srgrimes b_save->bs_children[i] = tbp; 7041541Srgrimes } 7055455Sdg b_save->bs_nchildren = i; 70610541Sdyson pmap_qenter(trunc_page((vm_offset_t) bp->b_data), 70710541Sdyson (vm_page_t *) bp->b_pages, bp->b_npages); 7085455Sdg bawrite(bp); 7091541Srgrimes 7101541Srgrimes if (i < len) { 7115455Sdg len -= i; 7121541Srgrimes goto redo; 7131541Srgrimes } 7141541Srgrimes} 7151541Srgrimes 7161541Srgrimes/* 7171541Srgrimes * Collect together all the buffers in a cluster. 7181541Srgrimes * Plus add one additional buffer. 
7191541Srgrimes */ 7201541Srgrimesstruct cluster_save * 7211541Srgrimescluster_collectbufs(vp, last_bp) 7221541Srgrimes struct vnode *vp; 7231541Srgrimes struct buf *last_bp; 7241541Srgrimes{ 7251541Srgrimes struct cluster_save *buflist; 7265455Sdg daddr_t lbn; 7271541Srgrimes int i, len; 7281541Srgrimes 7291541Srgrimes len = vp->v_lastw - vp->v_cstart + 1; 7301541Srgrimes buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), 7311541Srgrimes M_SEGMENT, M_WAITOK); 7321541Srgrimes buflist->bs_nchildren = 0; 7335455Sdg buflist->bs_children = (struct buf **) (buflist + 1); 7341541Srgrimes for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) 7355455Sdg (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, 7365455Sdg &buflist->bs_children[i]); 7371541Srgrimes buflist->bs_children[i] = last_bp; 7381541Srgrimes buflist->bs_nchildren = i + 1; 7391541Srgrimes return (buflist); 7401541Srgrimes} 741