vfs_cluster.c revision 12413
/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 * $Id: vfs_cluster.c,v 1.27 1995/11/20 03:55:48 dyson Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <miscfs/specfs/specdev.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
int doreallocblks = 0;
SYSCTL_INT(_debug, 13, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, "");

#else
/* XXX for cluster_write */
#define doreallocblks 0
#endif

/*
 * Local declarations
 */
static struct buf *cluster_rbuild __P((struct vnode *, u_quad_t,
    daddr_t, daddr_t, long, int));
struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));

int totreads;
int totreadblocks;
extern vm_page_t bogus_page;

#ifdef DIAGNOSTIC
/*
 * Set to 1 if reads of block zero should cause readahead to be done.
 * Set to 0 treats a read of block zero as a non-sequential read.
 *
 * Setting to one assumes that most reads of block zero of files are due to
 * sequential passes over the files (e.g. cat, sum) where additional blocks
 * will soon be needed.  Setting to zero assumes that the majority are
 * surgical strikes to get particular info (e.g. size, file) where readahead
 * blocks will not be used and, in fact, push out other potentially useful
 * blocks from the cache.  The former seems intuitive, but some quick tests
 * showed that the latter performed better from a system-wide point of view.
 */
int	doclusterraz = 0;

#define ISSEQREAD(vp, blk) \
	(((blk) != 0 || doclusterraz) && \
	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#else
#define ISSEQREAD(vp, blk) \
	(/* (blk) != 0 && */ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#endif

/*
 * allow for three entire read-aheads...  The system will
 * adjust downwards rapidly if needed...
 */
#define RA_MULTIPLE_FAST	2
#define RA_MULTIPLE_SLOW	3
#define RA_SHIFTDOWN	1	/* approx lg2(RA_MULTIPLE) */
/*
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
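/*
 * Usage sketch (editorial illustration, not part of this revision): a
 * file system read routine substitutes the routine below for bread()
 * roughly as follows, where ip->i_size stands in for the hypothetical
 * caller's idea of the file size:
 *
 *	if (doclusterread)
 *		error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, &bp);
 *	else
 *		error = bread(vp, lbn, size, NOCRED, &bp);
 */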
int
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, rablkno, origlblkno;
	int error, num_ra, alreadyincore;
	int i;
	int seq;

	error = 0;
	/*
	 * get the requested block
	 */
	origlblkno = lblkno;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	seq = ISSEQREAD(vp, lblkno);
	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		if (!seq) {
			vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
			vp->v_ralen >>= RA_SHIFTDOWN;
			return 0;
		} else if (vp->v_maxra > lblkno) {
			if ((vp->v_maxra + (vp->v_ralen / RA_MULTIPLE_SLOW)) >= (lblkno + vp->v_ralen)) {
				if ((vp->v_ralen + 1) < RA_MULTIPLE_FAST * (MAXPHYS / size))
					++vp->v_ralen;
				return 0;
			}
			lblkno = vp->v_maxra;
		} else {
			lblkno += 1;
		}
		bp = NULL;
	} else {
		/*
		 * if it isn't in the cache, then get a chunk from disk if
		 * sequential, otherwise just get the block.
		 */
		bp->b_flags |= B_READ;
		lblkno += 1;
		curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		vp->v_ralen = 0;
	}
	/*
	 * assume no read-ahead
	 */
	alreadyincore = 1;
	rablkno = lblkno;

	/*
	 * if we have been doing sequential I/O, then do some read-ahead
	 */
	if (seq) {
		/*
		 * bump ralen a bit...
		 */
		if ((vp->v_ralen + 1) < RA_MULTIPLE_SLOW * (MAXPHYS / size))
			++vp->v_ralen;
		/*
		 * this code makes sure that the stuff that we have read-ahead
		 * is still in the cache.  If it isn't, we have been reading
		 * ahead too much, and we need to back-off, otherwise we might
		 * try to read more.
		 */
		for (i = 0; i < vp->v_ralen; i++) {
			rablkno = lblkno + i;
			alreadyincore = (int) incore(vp, rablkno);
			if (!alreadyincore) {
				if (inmem(vp, rablkno)) {
					if (vp->v_maxra < rablkno)
						vp->v_maxra = rablkno + 1;
					continue;
				}
				if (rablkno < vp->v_maxra) {
					vp->v_maxra = rablkno;
					vp->v_ralen >>= RA_SHIFTDOWN;
					alreadyincore = 1;
				}
				break;
			} else if (vp->v_maxra < rablkno) {
				vp->v_maxra = rablkno + 1;
			}
		}
	}
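	/*
	 * Worked example of the window arithmetic above (editorial note,
	 * assuming MAXPHYS == 64 * 1024 and an 8K block size): MAXPHYS /
	 * size is 8, so v_ralen may grow by one per sequential read up to
	 * RA_MULTIPLE_SLOW * 8 - 1 == 23 blocks, while the scan above
	 * halves it (RA_SHIFTDOWN) whenever a previously read-ahead block
	 * has already been evicted from the cache.
	 */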
	/*
	 * we now build the read-ahead buffer if it is desirable.
	 */
	rbp = NULL;
	if (!alreadyincore &&
	    (rablkno + 1) * size <= filesize &&
	    !(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra, NULL)) &&
	    blkno != -1) {
		if (num_ra > vp->v_ralen)
			num_ra = vp->v_ralen;

		if (num_ra) {
			rbp = cluster_rbuild(vp, filesize, rablkno, blkno, size,
			    num_ra + 1);
		} else {
			rbp = getblk(vp, rablkno, size, 0, 0);
			rbp->b_flags |= B_READ | B_ASYNC;
			rbp->b_blkno = blkno;
		}
	}

	/*
	 * handle the synchronous read
	 */
	if (bp) {
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else {
			vfs_busy_pages(bp, 0);
			error = VOP_STRATEGY(bp);
			vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
			totreads++;
			totreadblocks += bp->b_bcount / size;
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	/*
	 * and if we have read-aheads, do them too
	 */
	if (rbp) {
		vp->v_maxra = rbp->b_lblkno + rbp->b_bcount / size;
		if (error || (rbp->b_flags & B_CACHE)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else {
			if ((rbp->b_flags & B_CLUSTER) == 0)
				vfs_busy_pages(rbp, 0);
			(void) VOP_STRATEGY(rbp);
			totreads++;
			totreadblocks += rbp->b_bcount / size;
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	if (bp && ((bp->b_flags & B_ASYNC) == 0))
		return (biowait(bp));
	return (error);
}
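#if 0
/*
 * Editorial sketch (never compiled): the sys/queue.h TAILQ pattern that
 * cluster_rbuild() and cluster_wbuild() below use to chain component
 * buffers onto the cluster's pseudo-buffer, reduced to a stand-alone
 * structure.  All names here are invented for illustration.
 */
#include <sys/queue.h>

struct component {
	TAILQ_ENTRY(component) c_entry;	/* cf. b_cluster.cluster_entry */
	int c_lbn;
};
TAILQ_HEAD(, component) c_head;		/* cf. b_cluster.cluster_head */

static void
tailq_demo(struct component *c0, struct component *c1)
{
	struct component *c, *nc;

	TAILQ_INIT(&c_head);
	TAILQ_INSERT_TAIL(&c_head, c0, c_entry);
	TAILQ_INSERT_TAIL(&c_head, c1, c_entry);
	/* walk the list the way cluster_callback() does */
	for (c = c_head.tqh_first; c; c = nc)
		nc = c->c_entry.tqe_next;
}
#endif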
/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
static struct buf *
cluster_rbuild(vp, filesize, lbn, blkno, size, run)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
{
	struct buf *bp, *tbp;
	daddr_t bn;
	int i, inc, j;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != f_iosize %d\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
	if (size * (lbn + run) > filesize)
		--run;

	tbp = getblk(vp, lbn, size, 0, 0);
	if (tbp->b_flags & B_CACHE)
		return tbp;

	tbp->b_blkno = blkno;
	tbp->b_flags |= B_ASYNC | B_READ;
	if (((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
		return tbp;

	bp = trypbuf();
	if (bp == 0)
		return tbp;

	(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
	bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
	bp->b_iodone = cluster_callback;
	bp->b_blkno = blkno;
	bp->b_lblkno = lbn;
	pbgetvp(vp, bp);

	TAILQ_INIT(&bp->b_cluster.cluster_head);

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	inc = btodb(size);
	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
		if (i != 0) {
			if ((bp->b_npages * PAGE_SIZE) + size > MAXPHYS)
				break;

			if (incore(vp, lbn + i))
				break;
			tbp = getblk(vp, lbn + i, size, 0, 0);

			if ((tbp->b_flags & B_CACHE) ||
			    (tbp->b_flags & B_VMIO) == 0) {
				brelse(tbp);
				break;
			}

			for (j = 0; j < tbp->b_npages; j++) {
				if (tbp->b_pages[j]->valid) {
					break;
				}
			}

			if (j != tbp->b_npages) {
				/*
				 * force buffer to be re-constituted later
				 */
				tbp->b_flags |= B_RELBUF;
				brelse(tbp);
				break;
			}

			tbp->b_flags |= B_READ | B_ASYNC;
			if (tbp->b_blkno == tbp->b_lblkno) {
				tbp->b_blkno = bn;
			} else if (tbp->b_blkno != bn) {
				brelse(tbp);
				break;
			}
		}
		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
		    tbp, b_cluster.cluster_entry);
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;
			m = tbp->b_pages[j];
			++m->busy;
			++m->object->paging_in_progress;
			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) {
				m = bogus_page;
			}
			if ((bp->b_npages == 0) ||
			    (bp->b_pages[bp->b_npages - 1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
		}
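		/*
		 * Editorial note: pages that are already fully valid were
		 * replaced by bogus_page in the page array assembled above,
		 * so the later pmap_qenter() maps a throwaway page there and
		 * the device transfer cannot clobber cached data that is
		 * already up to date.
		 */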
		bp->b_bcount += tbp->b_bcount;
		bp->b_bufsize += tbp->b_bufsize;
	}
	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
	    (vm_page_t *)bp->b_pages, bp->b_npages);
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct buf *nbp, *tbp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (tbp = bp->b_cluster.cluster_head.tqh_first;
	    tbp; tbp = nbp) {
		nbp = tbp->b_cluster.cluster_entry.tqe_next;
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		}
		biodone(tbp);
	}
	relpbuf(bp);
}
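/*
 * Usage sketch (editorial illustration, not part of this revision): a
 * file system write routine typically hands a full logical block to
 * cluster_write() below in place of bawrite()/bdwrite(), roughly:
 *
 *	if (ioflag & IO_SYNC)
 *		bwrite(bp);
 *	else if (xfersize + blkoffset == fs->fs_bsize)
 *		cluster_write(bp, ip->i_size);
 *	else
 *		bdwrite(bp);
 *
 * where ip->i_size, xfersize, blkoffset and fs are the hypothetical
 * caller's state.
 */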
/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;
	int lblocksize;
	int async;

	vp = bp->b_vp;
	async = (vp->v_mount && (vp->v_mount->mnt_flag & MNT_ASYNC));
	lblocksize = vp->v_mount->mnt_stat.f_iosize;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
		maxclen = MAXPHYS / lblocksize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster.  Otherwise try
			 * reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
#if 1
			if ((lbn + 1) * lblocksize != filesize ||
			    lbn != vp->v_lastw + 1 ||
			    vp->v_clen <= cursize) {
				if (!async)
					cluster_wbuild(vp, lblocksize,
					    vp->v_cstart, cursize);
			}
#else
			if (!doreallocblks ||
			    (lbn + 1) * lblocksize != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				if (!async)
					cluster_wbuild(vp, lblocksize,
					    vp->v_cstart, cursize);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 */
					for (bpp = buflist->bs_children;
					    bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, lblocksize,
					    vp->v_cstart, cursize);
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					    bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
#endif
		}
		/*
		 * Consider beginning a cluster.  If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if ((lbn + 1) * lblocksize != filesize &&
		    (bp->b_blkno == bp->b_lblkno) &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
		     bp->b_blkno == -1)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (!async && maxclen == 0) {	/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {	/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		bdwrite(bp);
		cluster_wbuild(vp, lblocksize, vp->v_cstart,
		    vp->v_clen + 1);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}
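/*
 * Worked example of the case analysis above (editorial note, assuming
 * an 8K block size, MAXPHYS == 64 * 1024, and a fully contiguous file):
 * maxclen is 64K/8K - 1 == 7, so sequential delayed writes of lbn 0..6
 * accumulate (case 2, then case 3); the write of lbn 7 == v_cstart +
 * v_clen pushes the whole 64K cluster (case 4); a seek elsewhere makes
 * the next write non-sequential and pushes the partial cluster (case 1).
 */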
/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * Write out len delayed-write buffers of the given size starting at
 * start_lbn, gathering runs of contiguous dirty buffers into single
 * physical writes.
 */
void
cluster_wbuild(vp, size, start_lbn, len)
	struct vnode *vp;
	long size;
	daddr_t start_lbn;
	int len;
{
	struct buf *bp, *tbp;
	int i, j, s;
	int dbsize = btodb(size);
	int origlen = len;

redo:
	if (len == 0)
		return;
	if (((tbp = incore(vp, start_lbn)) == NULL) ||
	    ((tbp->b_flags & (B_INVAL | B_BUSY | B_DELWRI)) != B_DELWRI)) {
		++start_lbn;
		--len;
		goto redo;
	}

	tbp = getblk(vp, start_lbn, size, 0, 0);
	if ((tbp->b_flags & B_DELWRI) == 0) {
		++start_lbn;
		--len;
		brelse(tbp);
		goto redo;
	}
	/*
	 * Extra memory in the buffer, punt on this buffer. XXX we could
	 * handle this in most cases, but we would have to push the extra
	 * memory down to after our max possible cluster size and then
	 * potentially pull it back up if the cluster was terminated
	 * prematurely--too much hassle.
	 */
	if (((tbp->b_flags & (B_VMIO | B_CLUSTEROK)) != (B_VMIO | B_CLUSTEROK)) ||
	    (tbp->b_bcount != tbp->b_bufsize) ||
	    len == 1) {
		bawrite(tbp);
		++start_lbn;
		--len;
		goto redo;
	}

	bp = trypbuf();
	if (bp == NULL) {
		bawrite(tbp);
		++start_lbn;
		--len;
		goto redo;
	}

	TAILQ_INIT(&bp->b_cluster.cluster_head);
	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	bp->b_blkno = tbp->b_blkno;
	bp->b_lblkno = tbp->b_lblkno;
	(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
	bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER;
	bp->b_iodone = cluster_callback;
	pbgetvp(vp, bp);

	for (i = 0; i < len; ++i, ++start_lbn) {
		if (i != 0) {
			s = splbio();
			if ((tbp = incore(vp, start_lbn)) == NULL) {
				splx(s);
				break;
			}

			if ((tbp->b_flags & (B_CLUSTEROK | B_INVAL | B_BUSY | B_DELWRI)) != (B_DELWRI | B_CLUSTEROK)) {
				splx(s);
				break;
			}

			if ((tbp->b_bcount != size) ||
			    ((bp->b_blkno + dbsize * i) != tbp->b_blkno) ||
			    ((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE))) {
				splx(s);
				break;
			}
			bremfree(tbp);
			tbp->b_flags |= B_BUSY;
			tbp->b_flags &= ~B_DONE;
			splx(s);
		}
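		/*
		 * Fold the component buffer's pages into the cluster
		 * buffer, skipping a page that was already added when
		 * adjacent component buffers share it.
		 */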
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;
			m = tbp->b_pages[j];
			++m->busy;
			++m->object->paging_in_progress;
			if ((bp->b_npages == 0) ||
			    (bp->b_pages[bp->b_npages - 1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
		}
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= B_ASYNC;
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
		    tbp, b_cluster.cluster_entry);
	}
	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
	    (vm_page_t *) bp->b_pages, bp->b_npages);
	bawrite(bp);

	len -= i;
	goto redo;
}

#if 0
/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *buflist;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED,
		    &buflist->bs_children[i]);
	buflist->bs_children[i] = last_bp;
	buflist->bs_nchildren = i + 1;
	return (buflist);
}
#endif
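/*
 * Editorial note on the allocation pattern above: cluster_collectbufs()
 * obtains the cluster_save header and its child-pointer array with one
 * malloc and points bs_children just past the header:
 *
 *	buflist = malloc(sizeof(struct buf *) * (len + 1) +
 *	    sizeof(*buflist), M_SEGMENT, M_WAITOK);
 *	buflist->bs_children = (struct buf **) (buflist + 1);
 *
 * so a single free(buflist, M_SEGMENT) releases both.
 */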