vfs_cluster.c revision 1937
11541Srgrimes/*- 21541Srgrimes * Copyright (c) 1993 31541Srgrimes * The Regents of the University of California. All rights reserved. 41541Srgrimes * 51541Srgrimes * Redistribution and use in source and binary forms, with or without 61541Srgrimes * modification, are permitted provided that the following conditions 71541Srgrimes * are met: 81541Srgrimes * 1. Redistributions of source code must retain the above copyright 91541Srgrimes * notice, this list of conditions and the following disclaimer. 101541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 111541Srgrimes * notice, this list of conditions and the following disclaimer in the 121541Srgrimes * documentation and/or other materials provided with the distribution. 131541Srgrimes * 3. All advertising materials mentioning features or use of this software 141541Srgrimes * must display the following acknowledgement: 151541Srgrimes * This product includes software developed by the University of 161541Srgrimes * California, Berkeley and its contributors. 171541Srgrimes * 4. Neither the name of the University nor the names of its contributors 181541Srgrimes * may be used to endorse or promote products derived from this software 191541Srgrimes * without specific prior written permission. 201541Srgrimes * 211541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 221541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 231541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 241541Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 251541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 261541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 271541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 281541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 291541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 301541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 311541Srgrimes * SUCH DAMAGE. 321541Srgrimes * 331541Srgrimes * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 341937Sdg * $Id: vfs_cluster.c,v 1.3 1994/08/02 07:43:17 davidg Exp $ 351541Srgrimes */ 361541Srgrimes 371541Srgrimes#include <sys/param.h> 381549Srgrimes#include <sys/systm.h> 391541Srgrimes#include <sys/proc.h> 401541Srgrimes#include <sys/buf.h> 411541Srgrimes#include <sys/vnode.h> 421541Srgrimes#include <sys/mount.h> 431541Srgrimes#include <sys/trace.h> 441541Srgrimes#include <sys/malloc.h> 451541Srgrimes#include <sys/resourcevar.h> 461541Srgrimes 471541Srgrimes#ifdef DEBUG 481541Srgrimes#include <vm/vm.h> 491541Srgrimes#include <sys/sysctl.h> 501541Srgrimesint doreallocblks = 1; 511541Srgrimesstruct ctldebug debug13 = { "doreallocblks", &doreallocblks }; 521541Srgrimes#else 531541Srgrimes/* XXX for cluster_write */ 541541Srgrimes#define doreallocblks 1 551541Srgrimes#endif 561541Srgrimes 571541Srgrimes/* 581541Srgrimes * Local declarations 591541Srgrimes */ 601541Srgrimesstruct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t, 611541Srgrimes daddr_t, long, int)); 621541Srgrimesstruct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *, 631541Srgrimes daddr_t, daddr_t, long, int, long)); 641541Srgrimesvoid cluster_wbuild __P((struct vnode *, struct buf *, long, 651541Srgrimes daddr_t, int, daddr_t)); 661541Srgrimesstruct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *)); 671541Srgrimes 681541Srgrimes#ifdef DIAGNOSTIC 691541Srgrimes/* 701541Srgrimes * Set to 1 if reads of block zero should cause readahead to be done. 711541Srgrimes * Set to 0 treats a read of block zero as a non-sequential read. 721541Srgrimes * 731541Srgrimes * Setting to one assumes that most reads of block zero of files are due to 741541Srgrimes * sequential passes over the files (e.g. cat, sum) where additional blocks 751541Srgrimes * will soon be needed. Setting to zero assumes that the majority are 761541Srgrimes * surgical strikes to get particular info (e.g. size, file) where readahead 771541Srgrimes * blocks will not be used and, in fact, push out other potentially useful 781541Srgrimes * blocks from the cache. The former seems intuitive, but some quick tests 791541Srgrimes * showed that the latter performed better from a system-wide point of view. 801541Srgrimes */ 811541Srgrimesint doclusterraz = 0; 821541Srgrimes#define ISSEQREAD(vp, blk) \ 831541Srgrimes (((blk) != 0 || doclusterraz) && \ 841541Srgrimes ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr)) 851541Srgrimes#else 861541Srgrimes#define ISSEQREAD(vp, blk) \ 871541Srgrimes ((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr)) 881541Srgrimes#endif 891541Srgrimes 901541Srgrimes/* 911541Srgrimes * This replaces bread. If this is a bread at the beginning of a file and 921541Srgrimes * lastr is 0, we assume this is the first read and we'll read up to two 931541Srgrimes * blocks if they are sequential. After that, we'll do regular read ahead 941541Srgrimes * in clustered chunks. 951541Srgrimes * 961541Srgrimes * There are 4 or 5 cases depending on how you count: 971541Srgrimes * Desired block is in the cache: 981541Srgrimes * 1 Not sequential access (0 I/Os). 991541Srgrimes * 2 Access is sequential, do read-ahead (1 ASYNC). 1001541Srgrimes * Desired block is not in cache: 1011541Srgrimes * 3 Not sequential access (1 SYNC). 1021541Srgrimes * 4 Sequential access, next block is contiguous (1 SYNC). 1031541Srgrimes * 5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC) 1041541Srgrimes * 1051541Srgrimes * There are potentially two buffers that require I/O. 1061541Srgrimes * bp is the block requested. 1071541Srgrimes * rbp is the read-ahead block. 1081541Srgrimes * If either is NULL, then you don't have to do the I/O. 1091541Srgrimes */ 1101549Srgrimesint 1111541Srgrimescluster_read(vp, filesize, lblkno, size, cred, bpp) 1121541Srgrimes struct vnode *vp; 1131541Srgrimes u_quad_t filesize; 1141541Srgrimes daddr_t lblkno; 1151541Srgrimes long size; 1161541Srgrimes struct ucred *cred; 1171541Srgrimes struct buf **bpp; 1181541Srgrimes{ 1191541Srgrimes struct buf *bp, *rbp; 1201541Srgrimes daddr_t blkno, ioblkno; 1211541Srgrimes long flags; 1221541Srgrimes int error, num_ra, alreadyincore; 1231541Srgrimes 1241541Srgrimes#ifdef DIAGNOSTIC 1251541Srgrimes if (size == 0) 1261541Srgrimes panic("cluster_read: size = 0"); 1271541Srgrimes#endif 1281541Srgrimes 1291541Srgrimes error = 0; 1301541Srgrimes flags = B_READ; 1311541Srgrimes *bpp = bp = getblk(vp, lblkno, size, 0, 0); 1321541Srgrimes if (bp->b_flags & B_CACHE) { 1331541Srgrimes /* 1341541Srgrimes * Desired block is in cache; do any readahead ASYNC. 1351541Srgrimes * Case 1, 2. 1361541Srgrimes */ 1371541Srgrimes trace(TR_BREADHIT, pack(vp, size), lblkno); 1381541Srgrimes flags |= B_ASYNC; 1391541Srgrimes ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1); 1401541Srgrimes alreadyincore = (int)incore(vp, ioblkno); 1411541Srgrimes bp = NULL; 1421541Srgrimes } else { 1431541Srgrimes /* Block wasn't in cache, case 3, 4, 5. */ 1441541Srgrimes trace(TR_BREADMISS, pack(vp, size), lblkno); 1451541Srgrimes bp->b_flags |= B_READ; 1461541Srgrimes ioblkno = lblkno; 1471541Srgrimes alreadyincore = 0; 1481541Srgrimes curproc->p_stats->p_ru.ru_inblock++; /* XXX */ 1491541Srgrimes } 1501541Srgrimes /* 1511541Srgrimes * XXX 1521541Srgrimes * Replace 1 with a window size based on some permutation of 1531541Srgrimes * maxcontig and rot_delay. This will let you figure out how 1541541Srgrimes * many blocks you should read-ahead (case 2, 4, 5). 1551541Srgrimes * 1561541Srgrimes * If the access isn't sequential, reset the window to 1. 1571541Srgrimes * Note that a read to the same block is considered sequential. 1581541Srgrimes * This catches the case where the file is being read sequentially, 1591541Srgrimes * but at smaller than the filesystem block size. 1601541Srgrimes */ 1611541Srgrimes rbp = NULL; 1621541Srgrimes if (!ISSEQREAD(vp, lblkno)) { 1631541Srgrimes vp->v_ralen = 0; 1641541Srgrimes vp->v_maxra = lblkno; 1651541Srgrimes } else if ((ioblkno + 1) * size <= filesize && !alreadyincore && 1661541Srgrimes !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) && 1671541Srgrimes blkno != -1) { 1681541Srgrimes /* 1691541Srgrimes * Reading sequentially, and the next block is not in the 1701541Srgrimes * cache. We are going to try reading ahead. 1711541Srgrimes */ 1721541Srgrimes if (num_ra) { 1731541Srgrimes /* 1741541Srgrimes * If our desired readahead block had been read 1751541Srgrimes * in a previous readahead but is no longer in 1761541Srgrimes * core, then we may be reading ahead too far 1771541Srgrimes * or are not using our readahead very rapidly. 1781541Srgrimes * In this case we scale back the window. 1791541Srgrimes */ 1801541Srgrimes if (!alreadyincore && ioblkno <= vp->v_maxra) 1811541Srgrimes vp->v_ralen = max(vp->v_ralen >> 1, 1); 1821541Srgrimes /* 1831541Srgrimes * There are more sequential blocks than our current 1841541Srgrimes * window allows, scale up. Ideally we want to get 1851541Srgrimes * in sync with the filesystem maxcontig value. 1861541Srgrimes */ 1871541Srgrimes else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr) 1881541Srgrimes vp->v_ralen = vp->v_ralen ? 1891541Srgrimes min(num_ra, vp->v_ralen << 1) : 1; 1901541Srgrimes 1911541Srgrimes if (num_ra > vp->v_ralen) 1921541Srgrimes num_ra = vp->v_ralen; 1931541Srgrimes } 1941541Srgrimes 1951541Srgrimes if (num_ra) /* case 2, 4 */ 1961541Srgrimes rbp = cluster_rbuild(vp, filesize, 1971541Srgrimes bp, ioblkno, blkno, size, num_ra, flags); 1981541Srgrimes else if (ioblkno == lblkno) { 1991541Srgrimes bp->b_blkno = blkno; 2001541Srgrimes /* Case 5: check how many blocks to read ahead */ 2011541Srgrimes ++ioblkno; 2021541Srgrimes if ((ioblkno + 1) * size > filesize || 2031541Srgrimes incore(vp, ioblkno) || (error = VOP_BMAP(vp, 2041541Srgrimes ioblkno, NULL, &blkno, &num_ra)) || blkno == -1) 2051541Srgrimes goto skip_readahead; 2061541Srgrimes /* 2071541Srgrimes * Adjust readahead as above 2081541Srgrimes */ 2091541Srgrimes if (num_ra) { 2101541Srgrimes if (!alreadyincore && ioblkno <= vp->v_maxra) 2111541Srgrimes vp->v_ralen = max(vp->v_ralen >> 1, 1); 2121541Srgrimes else if (num_ra > vp->v_ralen && 2131541Srgrimes lblkno != vp->v_lastr) 2141541Srgrimes vp->v_ralen = vp->v_ralen ? 2151541Srgrimes min(num_ra,vp->v_ralen<<1) : 1; 2161541Srgrimes if (num_ra > vp->v_ralen) 2171541Srgrimes num_ra = vp->v_ralen; 2181541Srgrimes } 2191541Srgrimes flags |= B_ASYNC; 2201541Srgrimes if (num_ra) 2211541Srgrimes rbp = cluster_rbuild(vp, filesize, 2221541Srgrimes NULL, ioblkno, blkno, size, num_ra, flags); 2231541Srgrimes else { 2241541Srgrimes rbp = getblk(vp, ioblkno, size, 0, 0); 2251541Srgrimes rbp->b_flags |= flags; 2261541Srgrimes rbp->b_blkno = blkno; 2271541Srgrimes } 2281541Srgrimes } else { 2291541Srgrimes /* case 2; read ahead single block */ 2301541Srgrimes rbp = getblk(vp, ioblkno, size, 0, 0); 2311541Srgrimes rbp->b_flags |= flags; 2321541Srgrimes rbp->b_blkno = blkno; 2331541Srgrimes } 2341541Srgrimes 2351541Srgrimes if (rbp == bp) /* case 4 */ 2361541Srgrimes rbp = NULL; 2371541Srgrimes else if (rbp) { /* case 2, 5 */ 2381541Srgrimes trace(TR_BREADMISSRA, 2391541Srgrimes pack(vp, (num_ra + 1) * size), ioblkno); 2401541Srgrimes curproc->p_stats->p_ru.ru_inblock++; /* XXX */ 2411541Srgrimes } 2421541Srgrimes } 2431541Srgrimes 2441541Srgrimes /* XXX Kirk, do we need to make sure the bp has creds? */ 2451541Srgrimesskip_readahead: 2461541Srgrimes if (bp) 2471541Srgrimes if (bp->b_flags & (B_DONE | B_DELWRI)) 2481541Srgrimes panic("cluster_read: DONE bp"); 2491541Srgrimes else 2501541Srgrimes error = VOP_STRATEGY(bp); 2511541Srgrimes 2521541Srgrimes if (rbp) 2531541Srgrimes if (error || rbp->b_flags & (B_DONE | B_DELWRI)) { 2541541Srgrimes rbp->b_flags &= ~(B_ASYNC | B_READ); 2551541Srgrimes brelse(rbp); 2561541Srgrimes } else 2571541Srgrimes (void) VOP_STRATEGY(rbp); 2581541Srgrimes 2591541Srgrimes /* 2601541Srgrimes * Recalculate our maximum readahead 2611541Srgrimes */ 2621541Srgrimes if (rbp == NULL) 2631541Srgrimes rbp = bp; 2641541Srgrimes if (rbp) 2651541Srgrimes vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1; 2661541Srgrimes 2671541Srgrimes if (bp) 2681541Srgrimes return(biowait(bp)); 2691541Srgrimes return(error); 2701541Srgrimes} 2711541Srgrimes 2721541Srgrimes/* 2731541Srgrimes * If blocks are contiguous on disk, use this to provide clustered 2741541Srgrimes * read ahead. We will read as many blocks as possible sequentially 2751541Srgrimes * and then parcel them up into logical blocks in the buffer hash table. 2761541Srgrimes */ 2771541Srgrimesstruct buf * 2781541Srgrimescluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags) 2791541Srgrimes struct vnode *vp; 2801541Srgrimes u_quad_t filesize; 2811541Srgrimes struct buf *bp; 2821541Srgrimes daddr_t lbn; 2831541Srgrimes daddr_t blkno; 2841541Srgrimes long size; 2851541Srgrimes int run; 2861541Srgrimes long flags; 2871541Srgrimes{ 2881541Srgrimes struct cluster_save *b_save; 2891541Srgrimes struct buf *tbp; 2901541Srgrimes daddr_t bn; 2911541Srgrimes int i, inc; 2921541Srgrimes 2931541Srgrimes#ifdef DIAGNOSTIC 2941541Srgrimes if (size != vp->v_mount->mnt_stat.f_iosize) 2951541Srgrimes panic("cluster_rbuild: size %d != filesize %d\n", 2961541Srgrimes size, vp->v_mount->mnt_stat.f_iosize); 2971541Srgrimes#endif 2981541Srgrimes if (size * (lbn + run + 1) > filesize) 2991541Srgrimes --run; 3001541Srgrimes if (run == 0) { 3011541Srgrimes if (!bp) { 3021541Srgrimes bp = getblk(vp, lbn, size, 0, 0); 3031541Srgrimes bp->b_blkno = blkno; 3041541Srgrimes bp->b_flags |= flags; 3051541Srgrimes } 3061541Srgrimes return(bp); 3071541Srgrimes } 3081541Srgrimes 3091541Srgrimes bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1); 3101541Srgrimes if (bp->b_flags & (B_DONE | B_DELWRI)) 3111541Srgrimes return (bp); 3121541Srgrimes 3131541Srgrimes b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save), 3141541Srgrimes M_SEGMENT, M_WAITOK); 3151541Srgrimes b_save->bs_bufsize = b_save->bs_bcount = size; 3161541Srgrimes b_save->bs_nchildren = 0; 3171541Srgrimes b_save->bs_children = (struct buf **)(b_save + 1); 3181541Srgrimes b_save->bs_saveaddr = bp->b_saveaddr; 3191541Srgrimes bp->b_saveaddr = (caddr_t) b_save; 3201541Srgrimes 3211541Srgrimes inc = btodb(size); 3221541Srgrimes for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) { 3231541Srgrimes if (incore(vp, lbn + i)) { 3241541Srgrimes if (i == 1) { 3251541Srgrimes bp->b_saveaddr = b_save->bs_saveaddr; 3261541Srgrimes bp->b_flags &= ~B_CALL; 3271541Srgrimes bp->b_iodone = NULL; 3281541Srgrimes allocbuf(bp, size); 3291541Srgrimes free(b_save, M_SEGMENT); 3301541Srgrimes } else 3311541Srgrimes allocbuf(bp, size * i); 3321541Srgrimes break; 3331541Srgrimes } 3341541Srgrimes tbp = getblk(vp, lbn + i, 0, 0, 0); 3351541Srgrimes /* 3361541Srgrimes * getblk may return some memory in the buffer if there were 3371541Srgrimes * no empty buffers to shed it to. If there is currently 3381541Srgrimes * memory in the buffer, we move it down size bytes to make 3391541Srgrimes * room for the valid pages that cluster_callback will insert. 3401541Srgrimes * We do this now so we don't have to do it at interrupt time 3411541Srgrimes * in the callback routine. 3421541Srgrimes */ 3431541Srgrimes if (tbp->b_bufsize != 0) { 3441541Srgrimes caddr_t bdata = (char *)tbp->b_data; 3451541Srgrimes 3461541Srgrimes if (tbp->b_bufsize + size > MAXBSIZE) 3471541Srgrimes panic("cluster_rbuild: too much memory"); 3481541Srgrimes if (tbp->b_bufsize > size) { 3491541Srgrimes /* 3501541Srgrimes * XXX if the source and destination regions 3511541Srgrimes * overlap we have to copy backward to avoid 3521541Srgrimes * clobbering any valid pages (i.e. pagemove 3531541Srgrimes * implementations typically can't handle 3541541Srgrimes * overlap). 3551541Srgrimes */ 3561541Srgrimes bdata += tbp->b_bufsize; 3571541Srgrimes while (bdata > (char *)tbp->b_data) { 3581541Srgrimes bdata -= CLBYTES; 3591541Srgrimes pagemove(bdata, bdata + size, CLBYTES); 3601541Srgrimes } 3611541Srgrimes } else 3621541Srgrimes pagemove(bdata, bdata + size, tbp->b_bufsize); 3631541Srgrimes } 3641541Srgrimes tbp->b_blkno = bn; 3651541Srgrimes tbp->b_flags |= flags | B_READ | B_ASYNC; 3661541Srgrimes ++b_save->bs_nchildren; 3671541Srgrimes b_save->bs_children[i - 1] = tbp; 3681541Srgrimes } 3691541Srgrimes return(bp); 3701541Srgrimes} 3711541Srgrimes 3721541Srgrimes/* 3731541Srgrimes * Either get a new buffer or grow the existing one. 3741541Srgrimes */ 3751541Srgrimesstruct buf * 3761541Srgrimescluster_newbuf(vp, bp, flags, blkno, lblkno, size, run) 3771541Srgrimes struct vnode *vp; 3781541Srgrimes struct buf *bp; 3791541Srgrimes long flags; 3801541Srgrimes daddr_t blkno; 3811541Srgrimes daddr_t lblkno; 3821541Srgrimes long size; 3831541Srgrimes int run; 3841541Srgrimes{ 3851541Srgrimes if (!bp) { 3861541Srgrimes bp = getblk(vp, lblkno, size, 0, 0); 3871541Srgrimes if (bp->b_flags & (B_DONE | B_DELWRI)) { 3881541Srgrimes bp->b_blkno = blkno; 3891541Srgrimes return(bp); 3901541Srgrimes } 3911541Srgrimes } 3921541Srgrimes allocbuf(bp, run * size); 3931541Srgrimes bp->b_blkno = blkno; 3941541Srgrimes bp->b_iodone = cluster_callback; 3951541Srgrimes bp->b_flags |= flags | B_CALL; 3961541Srgrimes return(bp); 3971541Srgrimes} 3981541Srgrimes 3991541Srgrimes/* 4001541Srgrimes * Cleanup after a clustered read or write. 4011541Srgrimes * This is complicated by the fact that any of the buffers might have 4021541Srgrimes * extra memory (if there were no empty buffer headers at allocbuf time) 4031541Srgrimes * that we will need to shift around. 4041541Srgrimes */ 4051541Srgrimesvoid 4061541Srgrimescluster_callback(bp) 4071541Srgrimes struct buf *bp; 4081541Srgrimes{ 4091541Srgrimes struct cluster_save *b_save; 4101541Srgrimes struct buf **bpp, *tbp; 4111541Srgrimes long bsize; 4121541Srgrimes caddr_t cp; 4131541Srgrimes int error = 0; 4141541Srgrimes 4151541Srgrimes /* 4161541Srgrimes * Must propogate errors to all the components. 4171541Srgrimes */ 4181541Srgrimes if (bp->b_flags & B_ERROR) 4191541Srgrimes error = bp->b_error; 4201541Srgrimes 4211541Srgrimes b_save = (struct cluster_save *)(bp->b_saveaddr); 4221541Srgrimes bp->b_saveaddr = b_save->bs_saveaddr; 4231541Srgrimes 4241541Srgrimes bsize = b_save->bs_bufsize; 4251541Srgrimes cp = (char *)bp->b_data + bsize; 4261541Srgrimes /* 4271541Srgrimes * Move memory from the large cluster buffer into the component 4281541Srgrimes * buffers and mark IO as done on these. 4291541Srgrimes */ 4301541Srgrimes for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) { 4311541Srgrimes tbp = *bpp; 4321541Srgrimes pagemove(cp, tbp->b_data, bsize); 4331541Srgrimes tbp->b_bufsize += bsize; 4341541Srgrimes tbp->b_bcount = bsize; 4351541Srgrimes if (error) { 4361541Srgrimes tbp->b_flags |= B_ERROR; 4371541Srgrimes tbp->b_error = error; 4381541Srgrimes } 4391541Srgrimes biodone(tbp); 4401541Srgrimes bp->b_bufsize -= bsize; 4411541Srgrimes cp += bsize; 4421541Srgrimes } 4431541Srgrimes /* 4441541Srgrimes * If there was excess memory in the cluster buffer, 4451541Srgrimes * slide it up adjacent to the remaining valid data. 4461541Srgrimes */ 4471541Srgrimes if (bp->b_bufsize != bsize) { 4481541Srgrimes if (bp->b_bufsize < bsize) 4491541Srgrimes panic("cluster_callback: too little memory"); 4501541Srgrimes pagemove(cp, (char *)bp->b_data + bsize, bp->b_bufsize - bsize); 4511541Srgrimes } 4521541Srgrimes bp->b_bcount = bsize; 4531541Srgrimes bp->b_iodone = NULL; 4541541Srgrimes free(b_save, M_SEGMENT); 4551541Srgrimes if (bp->b_flags & B_ASYNC) 4561541Srgrimes brelse(bp); 4571541Srgrimes else { 4581541Srgrimes bp->b_flags &= ~B_WANTED; 4591541Srgrimes wakeup((caddr_t)bp); 4601541Srgrimes } 4611541Srgrimes} 4621541Srgrimes 4631541Srgrimes/* 4641541Srgrimes * Do clustered write for FFS. 4651541Srgrimes * 4661541Srgrimes * Three cases: 4671541Srgrimes * 1. Write is not sequential (write asynchronously) 4681541Srgrimes * Write is sequential: 4691541Srgrimes * 2. beginning of cluster - begin cluster 4701541Srgrimes * 3. middle of a cluster - add to cluster 4711541Srgrimes * 4. end of a cluster - asynchronously write cluster 4721541Srgrimes */ 4731541Srgrimesvoid 4741541Srgrimescluster_write(bp, filesize) 4751541Srgrimes struct buf *bp; 4761541Srgrimes u_quad_t filesize; 4771541Srgrimes{ 4781541Srgrimes struct vnode *vp; 4791541Srgrimes daddr_t lbn; 4801541Srgrimes int maxclen, cursize; 4811541Srgrimes 4821541Srgrimes vp = bp->b_vp; 4831541Srgrimes lbn = bp->b_lblkno; 4841541Srgrimes 4851541Srgrimes /* Initialize vnode to beginning of file. */ 4861541Srgrimes if (lbn == 0) 4871541Srgrimes vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; 4881541Srgrimes 4891541Srgrimes if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || 4901541Srgrimes (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) { 4911541Srgrimes maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1; 4921541Srgrimes if (vp->v_clen != 0) { 4931541Srgrimes /* 4941541Srgrimes * Next block is not sequential. 4951541Srgrimes * 4961541Srgrimes * If we are not writing at end of file, the process 4971541Srgrimes * seeked to another point in the file since its 4981541Srgrimes * last write, or we have reached our maximum 4991541Srgrimes * cluster size, then push the previous cluster. 5001541Srgrimes * Otherwise try reallocating to make it sequential. 5011541Srgrimes */ 5021541Srgrimes cursize = vp->v_lastw - vp->v_cstart + 1; 5031541Srgrimes if (!doreallocblks || 5041541Srgrimes (lbn + 1) * bp->b_bcount != filesize || 5051541Srgrimes lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { 5061541Srgrimes cluster_wbuild(vp, NULL, bp->b_bcount, 5071541Srgrimes vp->v_cstart, cursize, lbn); 5081541Srgrimes } else { 5091541Srgrimes struct buf **bpp, **endbp; 5101541Srgrimes struct cluster_save *buflist; 5111541Srgrimes 5121541Srgrimes buflist = cluster_collectbufs(vp, bp); 5131541Srgrimes endbp = &buflist->bs_children 5141541Srgrimes [buflist->bs_nchildren - 1]; 5151541Srgrimes if (VOP_REALLOCBLKS(vp, buflist)) { 5161541Srgrimes /* 5171541Srgrimes * Failed, push the previous cluster. 5181541Srgrimes */ 5191541Srgrimes for (bpp = buflist->bs_children; 5201541Srgrimes bpp < endbp; bpp++) 5211541Srgrimes brelse(*bpp); 5221541Srgrimes free(buflist, M_SEGMENT); 5231541Srgrimes cluster_wbuild(vp, NULL, bp->b_bcount, 5241541Srgrimes vp->v_cstart, cursize, lbn); 5251541Srgrimes } else { 5261541Srgrimes /* 5271541Srgrimes * Succeeded, keep building cluster. 5281541Srgrimes */ 5291541Srgrimes for (bpp = buflist->bs_children; 5301541Srgrimes bpp <= endbp; bpp++) 5311541Srgrimes bdwrite(*bpp); 5321541Srgrimes free(buflist, M_SEGMENT); 5331541Srgrimes vp->v_lastw = lbn; 5341541Srgrimes vp->v_lasta = bp->b_blkno; 5351541Srgrimes return; 5361541Srgrimes } 5371541Srgrimes } 5381541Srgrimes } 5391541Srgrimes /* 5401541Srgrimes * Consider beginning a cluster. 5411541Srgrimes * If at end of file, make cluster as large as possible, 5421541Srgrimes * otherwise find size of existing cluster. 5431541Srgrimes */ 5441541Srgrimes if ((lbn + 1) * bp->b_bcount != filesize && 5451541Srgrimes (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) || 5461541Srgrimes bp->b_blkno == -1)) { 5471541Srgrimes bawrite(bp); 5481541Srgrimes vp->v_clen = 0; 5491541Srgrimes vp->v_lasta = bp->b_blkno; 5501541Srgrimes vp->v_cstart = lbn + 1; 5511541Srgrimes vp->v_lastw = lbn; 5521541Srgrimes return; 5531541Srgrimes } 5541541Srgrimes vp->v_clen = maxclen; 5551541Srgrimes if (maxclen == 0) { /* I/O not contiguous */ 5561541Srgrimes vp->v_cstart = lbn + 1; 5571541Srgrimes bawrite(bp); 5581541Srgrimes } else { /* Wait for rest of cluster */ 5591541Srgrimes vp->v_cstart = lbn; 5601541Srgrimes bdwrite(bp); 5611541Srgrimes } 5621541Srgrimes } else if (lbn == vp->v_cstart + vp->v_clen) { 5631541Srgrimes /* 5641541Srgrimes * At end of cluster, write it out. 5651541Srgrimes */ 5661541Srgrimes cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart, 5671541Srgrimes vp->v_clen + 1, lbn); 5681541Srgrimes vp->v_clen = 0; 5691541Srgrimes vp->v_cstart = lbn + 1; 5701541Srgrimes } else 5711541Srgrimes /* 5721541Srgrimes * In the middle of a cluster, so just delay the 5731541Srgrimes * I/O for now. 5741541Srgrimes */ 5751541Srgrimes bdwrite(bp); 5761541Srgrimes vp->v_lastw = lbn; 5771541Srgrimes vp->v_lasta = bp->b_blkno; 5781541Srgrimes} 5791541Srgrimes 5801541Srgrimes 5811541Srgrimes/* 5821541Srgrimes * This is an awful lot like cluster_rbuild...wish they could be combined. 5831541Srgrimes * The last lbn argument is the current block on which I/O is being 5841541Srgrimes * performed. Check to see that it doesn't fall in the middle of 5851541Srgrimes * the current block (if last_bp == NULL). 5861541Srgrimes */ 5871541Srgrimesvoid 5881541Srgrimescluster_wbuild(vp, last_bp, size, start_lbn, len, lbn) 5891541Srgrimes struct vnode *vp; 5901541Srgrimes struct buf *last_bp; 5911541Srgrimes long size; 5921541Srgrimes daddr_t start_lbn; 5931541Srgrimes int len; 5941541Srgrimes daddr_t lbn; 5951541Srgrimes{ 5961541Srgrimes struct cluster_save *b_save; 5971541Srgrimes struct buf *bp, *tbp; 5981541Srgrimes caddr_t cp; 5991541Srgrimes int i, s; 6001541Srgrimes 6011541Srgrimes#ifdef DIAGNOSTIC 6021541Srgrimes if (size != vp->v_mount->mnt_stat.f_iosize) 6031541Srgrimes panic("cluster_wbuild: size %d != filesize %d\n", 6041541Srgrimes size, vp->v_mount->mnt_stat.f_iosize); 6051541Srgrimes#endif 6061541Srgrimesredo: 6071541Srgrimes while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) { 6081541Srgrimes ++start_lbn; 6091541Srgrimes --len; 6101541Srgrimes } 6111541Srgrimes 6121541Srgrimes /* Get more memory for current buffer */ 6131541Srgrimes if (len <= 1) { 6141541Srgrimes if (last_bp) { 6151541Srgrimes bawrite(last_bp); 6161541Srgrimes } else if (len) { 6171541Srgrimes bp = getblk(vp, start_lbn, size, 0, 0); 6181541Srgrimes bawrite(bp); 6191541Srgrimes } 6201541Srgrimes return; 6211541Srgrimes } 6221541Srgrimes 6231541Srgrimes bp = getblk(vp, start_lbn, size, 0, 0); 6241541Srgrimes if (!(bp->b_flags & B_DELWRI)) { 6251541Srgrimes ++start_lbn; 6261541Srgrimes --len; 6271541Srgrimes brelse(bp); 6281541Srgrimes goto redo; 6291541Srgrimes } 6301541Srgrimes 6311541Srgrimes /* 6321541Srgrimes * Extra memory in the buffer, punt on this buffer. 6331541Srgrimes * XXX we could handle this in most cases, but we would have to 6341541Srgrimes * push the extra memory down to after our max possible cluster 6351541Srgrimes * size and then potentially pull it back up if the cluster was 6361541Srgrimes * terminated prematurely--too much hassle. 6371541Srgrimes */ 6381541Srgrimes if (bp->b_bcount != bp->b_bufsize) { 6391541Srgrimes ++start_lbn; 6401541Srgrimes --len; 6411541Srgrimes bawrite(bp); 6421541Srgrimes goto redo; 6431541Srgrimes } 6441541Srgrimes 6451541Srgrimes --len; 6461541Srgrimes b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save), 6471541Srgrimes M_SEGMENT, M_WAITOK); 6481541Srgrimes b_save->bs_bcount = bp->b_bcount; 6491541Srgrimes b_save->bs_bufsize = bp->b_bufsize; 6501541Srgrimes b_save->bs_nchildren = 0; 6511541Srgrimes b_save->bs_children = (struct buf **)(b_save + 1); 6521541Srgrimes b_save->bs_saveaddr = bp->b_saveaddr; 6531541Srgrimes bp->b_saveaddr = (caddr_t) b_save; 6541541Srgrimes 6551541Srgrimes bp->b_flags |= B_CALL; 6561541Srgrimes bp->b_iodone = cluster_callback; 6571541Srgrimes cp = (char *)bp->b_data + size; 6581541Srgrimes for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) { 6591541Srgrimes /* 6601541Srgrimes * Block is not in core or the non-sequential block 6611541Srgrimes * ending our cluster was part of the cluster (in which 6621541Srgrimes * case we don't want to write it twice). 6631541Srgrimes */ 6641541Srgrimes if (!incore(vp, start_lbn) || 6651541Srgrimes last_bp == NULL && start_lbn == lbn) 6661541Srgrimes break; 6671541Srgrimes 6681541Srgrimes /* 6691541Srgrimes * Get the desired block buffer (unless it is the final 6701541Srgrimes * sequential block whose buffer was passed in explictly 6711541Srgrimes * as last_bp). 6721541Srgrimes */ 6731541Srgrimes if (last_bp == NULL || start_lbn != lbn) { 6741541Srgrimes tbp = getblk(vp, start_lbn, size, 0, 0); 6751541Srgrimes if (!(tbp->b_flags & B_DELWRI)) { 6761541Srgrimes brelse(tbp); 6771541Srgrimes break; 6781541Srgrimes } 6791541Srgrimes } else 6801541Srgrimes tbp = last_bp; 6811541Srgrimes 6821541Srgrimes ++b_save->bs_nchildren; 6831541Srgrimes 6841541Srgrimes /* Move memory from children to parent */ 6851541Srgrimes if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) { 6861541Srgrimes printf("Clustered Block: %d addr %x bufsize: %d\n", 6871541Srgrimes bp->b_lblkno, bp->b_blkno, bp->b_bufsize); 6881541Srgrimes printf("Child Block: %d addr: %x\n", tbp->b_lblkno, 6891541Srgrimes tbp->b_blkno); 6901541Srgrimes panic("Clustered write to wrong blocks"); 6911541Srgrimes } 6921541Srgrimes 6931541Srgrimes pagemove(tbp->b_data, cp, size); 6941541Srgrimes bp->b_bcount += size; 6951541Srgrimes bp->b_bufsize += size; 6961541Srgrimes 6971541Srgrimes tbp->b_bufsize -= size; 6981541Srgrimes tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); 6991937Sdg tbp->b_flags |= B_ASYNC; 7001541Srgrimes s = splbio(); 7011541Srgrimes reassignbuf(tbp, tbp->b_vp); /* put on clean list */ 7021541Srgrimes ++tbp->b_vp->v_numoutput; 7031541Srgrimes splx(s); 7041541Srgrimes b_save->bs_children[i] = tbp; 7051541Srgrimes 7061541Srgrimes cp += size; 7071541Srgrimes } 7081541Srgrimes 7091541Srgrimes if (i == 0) { 7101541Srgrimes /* None to cluster */ 7111541Srgrimes bp->b_saveaddr = b_save->bs_saveaddr; 7121541Srgrimes bp->b_flags &= ~B_CALL; 7131541Srgrimes bp->b_iodone = NULL; 7141541Srgrimes free(b_save, M_SEGMENT); 7151541Srgrimes } 7161541Srgrimes bawrite(bp); 7171541Srgrimes if (i < len) { 7181541Srgrimes len -= i + 1; 7191541Srgrimes start_lbn += 1; 7201541Srgrimes goto redo; 7211541Srgrimes } 7221541Srgrimes} 7231541Srgrimes 7241541Srgrimes/* 7251541Srgrimes * Collect together all the buffers in a cluster. 7261541Srgrimes * Plus add one additional buffer. 7271541Srgrimes */ 7281541Srgrimesstruct cluster_save * 7291541Srgrimescluster_collectbufs(vp, last_bp) 7301541Srgrimes struct vnode *vp; 7311541Srgrimes struct buf *last_bp; 7321541Srgrimes{ 7331541Srgrimes struct cluster_save *buflist; 7341541Srgrimes daddr_t lbn; 7351541Srgrimes int i, len; 7361541Srgrimes 7371541Srgrimes len = vp->v_lastw - vp->v_cstart + 1; 7381541Srgrimes buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), 7391541Srgrimes M_SEGMENT, M_WAITOK); 7401541Srgrimes buflist->bs_nchildren = 0; 7411541Srgrimes buflist->bs_children = (struct buf **)(buflist + 1); 7421541Srgrimes for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) 7431541Srgrimes (void)bread(vp, lbn, last_bp->b_bcount, NOCRED, 7441541Srgrimes &buflist->bs_children[i]); 7451541Srgrimes buflist->bs_children[i] = last_bp; 7461541Srgrimes buflist->bs_nchildren = i + 1; 7471541Srgrimes return (buflist); 7481541Srgrimes} 749