vfs_cluster.c revision 92723
11541Srgrimes/*- 21541Srgrimes * Copyright (c) 1993 31541Srgrimes * The Regents of the University of California. All rights reserved. 45455Sdg * Modifications/enhancements: 55455Sdg * Copyright (c) 1995 John S. Dyson. All rights reserved. 61541Srgrimes * 71541Srgrimes * Redistribution and use in source and binary forms, with or without 81541Srgrimes * modification, are permitted provided that the following conditions 91541Srgrimes * are met: 101541Srgrimes * 1. Redistributions of source code must retain the above copyright 111541Srgrimes * notice, this list of conditions and the following disclaimer. 121541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 131541Srgrimes * notice, this list of conditions and the following disclaimer in the 141541Srgrimes * documentation and/or other materials provided with the distribution. 151541Srgrimes * 3. All advertising materials mentioning features or use of this software 161541Srgrimes * must display the following acknowledgement: 171541Srgrimes * This product includes software developed by the University of 181541Srgrimes * California, Berkeley and its contributors. 191541Srgrimes * 4. Neither the name of the University nor the names of its contributors 201541Srgrimes * may be used to endorse or promote products derived from this software 211541Srgrimes * without specific prior written permission. 221541Srgrimes * 231541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 241541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 251541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 261541Srgrimes * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 271541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 281541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 291541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 301541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 311541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 321541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 331541Srgrimes * SUCH DAMAGE. 341541Srgrimes * 351541Srgrimes * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 3650477Speter * $FreeBSD: head/sys/kern/vfs_cluster.c 92723 2002-03-19 21:25:46Z alfred $ 371541Srgrimes */ 381541Srgrimes 3932929Seivind#include "opt_debug_cluster.h" 4032929Seivind 411541Srgrimes#include <sys/param.h> 421549Srgrimes#include <sys/systm.h> 4341168Sbde#include <sys/kernel.h> 441541Srgrimes#include <sys/proc.h> 4560041Sphk#include <sys/bio.h> 461541Srgrimes#include <sys/buf.h> 471541Srgrimes#include <sys/vnode.h> 4841124Sdg#include <sys/malloc.h> 491541Srgrimes#include <sys/mount.h> 501541Srgrimes#include <sys/resourcevar.h> 5168885Sdillon#include <sys/vmmeter.h> 526621Sdg#include <vm/vm.h> 5310541Sdyson#include <vm/vm_object.h> 5410541Sdyson#include <vm/vm_page.h> 5548545Smckusick#include <sys/sysctl.h> 561541Srgrimes 5721002Sdyson#if defined(CLUSTERDEBUG) 5821002Sdyson#include <sys/sysctl.h> 5921002Sdysonstatic int rcluster= 0; 6091690SeivindSYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, 6191690Seivind "Debug VFS clustering code"); 6221002Sdyson#endif 6321002Sdyson 6441124Sdgstatic MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer"); 6541124Sdg 6612973Sbdestatic struct cluster_save * 6792723Salfred cluster_collectbufs(struct vnode *vp, struct buf *last_bp); 6812973Sbdestatic struct buf * 6992723Salfred cluster_rbuild(struct 
vnode *vp, u_quad_t filesize, daddr_t lbn, 7092723Salfred daddr64_t blkno, long size, int run, struct buf *fbp); 711541Srgrimes 7248545Smckusickstatic int write_behind = 1; 7391690SeivindSYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, 7491690Seivind "Cluster write-behind; 0: disable, 1: enable, 2: backed off"); 7548545Smckusick 7691690Seivind/* Page expended to mark partially backed buffers */ 7712973Sbdeextern vm_page_t bogus_page; 785455Sdg 7991690Seivind/* 8091690Seivind * Number of physical bufs (pbufs) this subsystem is allowed. 8191690Seivind * Manipulated by vm_pager.c 8291690Seivind */ 8342957Sdillonextern int cluster_pbuf_freecnt; 8442957Sdillon 851541Srgrimes/* 8621002Sdyson * Maximum number of blocks for read-ahead. 871541Srgrimes */ 8821002Sdyson#define MAXRA 32 895455Sdg 901541Srgrimes/* 9191690Seivind * Read data to a buf, including read-ahead if we find this to be beneficial. 9291690Seivind * cluster_read replaces bread. 9310541Sdyson */ 941549Srgrimesint 9521002Sdysoncluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) 961541Srgrimes struct vnode *vp; 971541Srgrimes u_quad_t filesize; 981541Srgrimes daddr_t lblkno; 991541Srgrimes long size; 1001541Srgrimes struct ucred *cred; 10121002Sdyson long totread; 10221002Sdyson int seqcount; 1031541Srgrimes struct buf **bpp; 1041541Srgrimes{ 10521002Sdyson struct buf *bp, *rbp, *reqbp; 10692363Smckusick daddr64_t blkno, origblkno; 10721002Sdyson int error, num_ra; 10810541Sdyson int i; 10921002Sdyson int maxra, racluster; 11021002Sdyson long origtotread; 1111541Srgrimes 1121541Srgrimes error = 0; 11321002Sdyson 1145455Sdg /* 11521002Sdyson * Try to limit the amount of read-ahead by a few 11621002Sdyson * ad-hoc parameters. This needs work!!! 
11721002Sdyson */ 11851797Sphk racluster = vp->v_mount->mnt_iosize_max / size; 11921002Sdyson maxra = 2 * racluster + (totread / size); 12021002Sdyson if (maxra > MAXRA) 12121002Sdyson maxra = MAXRA; 12221002Sdyson if (maxra > nbuf/8) 12321002Sdyson maxra = nbuf/8; 12421002Sdyson 12521002Sdyson /* 1265455Sdg * get the requested block 1275455Sdg */ 12821002Sdyson *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0); 12921002Sdyson origblkno = lblkno; 13021002Sdyson origtotread = totread; 13112767Sdyson 1325455Sdg /* 1335455Sdg * if it is in the cache, then check to see if the reads have been 1345455Sdg * sequential. If they have, then try some read-ahead, otherwise 1355455Sdg * back-off on prospective read-aheads. 1365455Sdg */ 1371541Srgrimes if (bp->b_flags & B_CACHE) { 13821002Sdyson if (!seqcount) { 1395455Sdg return 0; 14021002Sdyson } else if ((bp->b_flags & B_RAM) == 0) { 14121002Sdyson return 0; 14221002Sdyson } else { 14321002Sdyson int s; 14421002Sdyson struct buf *tbp; 14521002Sdyson bp->b_flags &= ~B_RAM; 14621002Sdyson /* 14721002Sdyson * We do the spl here so that there is no window 14821002Sdyson * between the incore and the b_usecount increment 14921002Sdyson * below. We opt to keep the spl out of the loop 15021002Sdyson * for efficiency. 15121002Sdyson */ 15221002Sdyson s = splbio(); 15348225Smckusick for (i = 1; i < maxra; i++) { 15421002Sdyson 15521002Sdyson if (!(tbp = incore(vp, lblkno+i))) { 15621002Sdyson break; 15721002Sdyson } 15821002Sdyson 15921002Sdyson /* 16048677Smckusick * Set another read-ahead mark so we know 16148677Smckusick * to check again. 
16221002Sdyson */ 16321002Sdyson if (((i % racluster) == (racluster - 1)) || 16421002Sdyson (i == (maxra - 1))) 16521002Sdyson tbp->b_flags |= B_RAM; 16621002Sdyson } 16721002Sdyson splx(s); 16821002Sdyson if (i >= maxra) { 1695839Sdg return 0; 17010541Sdyson } 17121002Sdyson lblkno += i; 17221002Sdyson } 17321002Sdyson reqbp = bp = NULL; 17421002Sdyson } else { 17542453Seivind off_t firstread = bp->b_offset; 17642453Seivind 17742408Seivind KASSERT(bp->b_offset != NOOFFSET, 17842453Seivind ("cluster_read: no buffer offset")); 17921002Sdyson if (firstread + totread > filesize) 18021002Sdyson totread = filesize - firstread; 18121002Sdyson if (totread > size) { 18221002Sdyson int nblks = 0; 18321002Sdyson int ncontigafter; 18421002Sdyson while (totread > 0) { 18521002Sdyson nblks++; 18621002Sdyson totread -= size; 18721002Sdyson } 18821002Sdyson if (nblks == 1) 18921002Sdyson goto single_block_read; 19021002Sdyson if (nblks > racluster) 19121002Sdyson nblks = racluster; 19221002Sdyson 19321002Sdyson error = VOP_BMAP(vp, lblkno, NULL, 19421002Sdyson &blkno, &ncontigafter, NULL); 19521002Sdyson if (error) 19621002Sdyson goto single_block_read; 19721002Sdyson if (blkno == -1) 19821002Sdyson goto single_block_read; 19921002Sdyson if (ncontigafter == 0) 20021002Sdyson goto single_block_read; 20121002Sdyson if (ncontigafter + 1 < nblks) 20221002Sdyson nblks = ncontigafter + 1; 20321002Sdyson 20421002Sdyson bp = cluster_rbuild(vp, filesize, lblkno, 20521002Sdyson blkno, size, nblks, bp); 20634694Sdyson lblkno += (bp->b_bufsize / size); 20710541Sdyson } else { 20821002Sdysonsingle_block_read: 20921002Sdyson /* 21021002Sdyson * if it isn't in the cache, then get a chunk from 21121002Sdyson * disk if sequential, otherwise just get the block. 
21221002Sdyson */ 21358345Sphk bp->b_flags |= B_RAM; 21458345Sphk bp->b_iocmd = BIO_READ; 21510541Sdyson lblkno += 1; 2168876Srgrimes } 2171541Srgrimes } 2185455Sdg 2195455Sdg /* 2205455Sdg * if we have been doing sequential I/O, then do some read-ahead 2215455Sdg */ 22221002Sdyson rbp = NULL; 22321002Sdyson if (seqcount && (lblkno < (origblkno + seqcount))) { 2241541Srgrimes /* 22521002Sdyson * we now build the read-ahead buffer if it is desirable. 2261541Srgrimes */ 22721002Sdyson if (((u_quad_t)(lblkno + 1) * size) <= filesize && 22821002Sdyson !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) && 22921002Sdyson blkno != -1) { 23021002Sdyson int nblksread; 23121002Sdyson int ntoread = num_ra + 1; 23221002Sdyson nblksread = (origtotread + size - 1) / size; 23321002Sdyson if (seqcount < nblksread) 23421002Sdyson seqcount = nblksread; 23521002Sdyson if (seqcount < ntoread) 23621002Sdyson ntoread = seqcount; 23721002Sdyson if (num_ra) { 23821002Sdyson rbp = cluster_rbuild(vp, filesize, lblkno, 23921002Sdyson blkno, size, ntoread, NULL); 24021002Sdyson } else { 24121002Sdyson rbp = getblk(vp, lblkno, size, 0, 0); 24258345Sphk rbp->b_flags |= B_ASYNC | B_RAM; 24358345Sphk rbp->b_iocmd = BIO_READ; 24421002Sdyson rbp->b_blkno = blkno; 2455455Sdg } 2461541Srgrimes } 2475455Sdg } 2481541Srgrimes 2495455Sdg /* 25010541Sdyson * handle the synchronous read 2515455Sdg */ 2525455Sdg if (bp) { 25321002Sdyson#if defined(CLUSTERDEBUG) 25436275Sdyson if (rcluster) 25537951Sbde printf("S(%ld,%ld,%d) ", 25637951Sbde (long)bp->b_lblkno, bp->b_bcount, seqcount); 25721002Sdyson#endif 25870374Sdillon if ((bp->b_flags & B_CLUSTER) == 0) { 25936275Sdyson vfs_busy_pages(bp, 0); 26070374Sdillon } 26158934Sphk bp->b_flags &= ~B_INVAL; 26258934Sphk bp->b_ioflags &= ~BIO_ERROR; 26358345Sphk if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL) 26448333Speter BUF_KERNPROC(bp); 26537384Sjulian error = VOP_STRATEGY(vp, bp); 26636275Sdyson curproc->p_stats->p_ru.ru_inblock++; 2675455Sdg } 
26834611Sdyson 2695455Sdg /* 2705455Sdg * and if we have read-aheads, do them too 2715455Sdg */ 2725455Sdg if (rbp) { 27313490Sdyson if (error) { 27458345Sphk rbp->b_flags &= ~B_ASYNC; 2751541Srgrimes brelse(rbp); 27613490Sdyson } else if (rbp->b_flags & B_CACHE) { 27758345Sphk rbp->b_flags &= ~B_ASYNC; 27813490Sdyson bqrelse(rbp); 2795455Sdg } else { 28021002Sdyson#if defined(CLUSTERDEBUG) 28121002Sdyson if (rcluster) { 28221002Sdyson if (bp) 28337951Sbde printf("A+(%ld,%ld,%ld,%d) ", 28437951Sbde (long)rbp->b_lblkno, rbp->b_bcount, 28537951Sbde (long)(rbp->b_lblkno - origblkno), 28637951Sbde seqcount); 28721002Sdyson else 28837951Sbde printf("A(%ld,%ld,%ld,%d) ", 28937951Sbde (long)rbp->b_lblkno, rbp->b_bcount, 29037951Sbde (long)(rbp->b_lblkno - origblkno), 29137951Sbde seqcount); 29221002Sdyson } 29321002Sdyson#endif 29421002Sdyson 29570374Sdillon if ((rbp->b_flags & B_CLUSTER) == 0) { 29610541Sdyson vfs_busy_pages(rbp, 0); 29770374Sdillon } 29858934Sphk rbp->b_flags &= ~B_INVAL; 29958934Sphk rbp->b_ioflags &= ~BIO_ERROR; 30058345Sphk if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL) 30148333Speter BUF_KERNPROC(rbp); 30237384Sjulian (void) VOP_STRATEGY(vp, rbp); 3035455Sdg curproc->p_stats->p_ru.ru_inblock++; 3045455Sdg } 3055455Sdg } 30621002Sdyson if (reqbp) 30759762Sphk return (bufwait(reqbp)); 30821002Sdyson else 30921002Sdyson return (error); 3101541Srgrimes} 3111541Srgrimes 3121541Srgrimes/* 3131541Srgrimes * If blocks are contiguous on disk, use this to provide clustered 3141541Srgrimes * read ahead. We will read as many blocks as possible sequentially 3151541Srgrimes * and then parcel them up into logical blocks in the buffer hash table. 
3161541Srgrimes */ 31710541Sdysonstatic struct buf * 31821002Sdysoncluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) 3191541Srgrimes struct vnode *vp; 3201541Srgrimes u_quad_t filesize; 3211541Srgrimes daddr_t lbn; 32292363Smckusick daddr64_t blkno; 3231541Srgrimes long size; 3241541Srgrimes int run; 32521002Sdyson struct buf *fbp; 3261541Srgrimes{ 32710541Sdyson struct buf *bp, *tbp; 3281541Srgrimes daddr_t bn; 32940648Sphk int i, inc, j; 3301541Srgrimes 33179224Sdillon GIANT_REQUIRED; 33279224Sdillon 33342408Seivind KASSERT(size == vp->v_mount->mnt_stat.f_iosize, 33442453Seivind ("cluster_rbuild: size %ld != filesize %ld\n", 33542453Seivind size, vp->v_mount->mnt_stat.f_iosize)); 33642453Seivind 33712767Sdyson /* 33812767Sdyson * avoid a division 33912767Sdyson */ 34012767Sdyson while ((u_quad_t) size * (lbn + run) > filesize) { 3411541Srgrimes --run; 34212767Sdyson } 34310541Sdyson 34421002Sdyson if (fbp) { 34521002Sdyson tbp = fbp; 34658345Sphk tbp->b_iocmd = BIO_READ; 34721002Sdyson } else { 34821002Sdyson tbp = getblk(vp, lbn, size, 0, 0); 34921002Sdyson if (tbp->b_flags & B_CACHE) 35021002Sdyson return tbp; 35158345Sphk tbp->b_flags |= B_ASYNC | B_RAM; 35258345Sphk tbp->b_iocmd = BIO_READ; 35321002Sdyson } 35410541Sdyson 35510541Sdyson tbp->b_blkno = blkno; 35616086Sdyson if( (tbp->b_flags & B_MALLOC) || 35716086Sdyson ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) 35810541Sdyson return tbp; 35910541Sdyson 36042957Sdillon bp = trypbuf(&cluster_pbuf_freecnt); 36110541Sdyson if (bp == 0) 36210541Sdyson return tbp; 36310541Sdyson 36485272Sdillon /* 36585272Sdillon * We are synthesizing a buffer out of vm_page_t's, but 36685272Sdillon * if the block size is not page aligned then the starting 36785272Sdillon * address may not be either. Inherit the b_data offset 36885272Sdillon * from the original buffer. 
36985272Sdillon */ 37037467Sbde bp->b_data = (char *)((vm_offset_t)bp->b_data | 37137467Sbde ((vm_offset_t)tbp->b_data & PAGE_MASK)); 37258345Sphk bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO; 37358345Sphk bp->b_iocmd = BIO_READ; 3745455Sdg bp->b_iodone = cluster_callback; 3755455Sdg bp->b_blkno = blkno; 3765455Sdg bp->b_lblkno = lbn; 37734611Sdyson bp->b_offset = tbp->b_offset; 37842453Seivind KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset")); 3795455Sdg pbgetvp(vp, bp); 3801541Srgrimes 38112404Sdyson TAILQ_INIT(&bp->b_cluster.cluster_head); 3821541Srgrimes 3835455Sdg bp->b_bcount = 0; 3845455Sdg bp->b_bufsize = 0; 3855455Sdg bp->b_npages = 0; 3865455Sdg 3871541Srgrimes inc = btodb(size); 38810541Sdyson for (bn = blkno, i = 0; i < run; ++i, bn += inc) { 3895455Sdg if (i != 0) { 39012767Sdyson if ((bp->b_npages * PAGE_SIZE) + 39185272Sdillon round_page(size) > vp->v_mount->mnt_iosize_max) { 39210541Sdyson break; 39385272Sdillon } 39410978Sdyson 39585272Sdillon /* 39685272Sdillon * Shortcut some checks and try to avoid buffers that 39785272Sdillon * would block in the lock. The same checks have to 39885272Sdillon * be made again after we officially get the buffer. 
39985272Sdillon */ 40043301Sdillon if ((tbp = incore(vp, lbn + i)) != NULL) { 40148225Smckusick if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) 40234611Sdyson break; 40348225Smckusick BUF_UNLOCK(tbp); 40412767Sdyson 40585272Sdillon for (j = 0; j < tbp->b_npages; j++) { 40634611Sdyson if (tbp->b_pages[j]->valid) 40734611Sdyson break; 40885272Sdillon } 40934611Sdyson 41034611Sdyson if (j != tbp->b_npages) 41134611Sdyson break; 41234611Sdyson 41334611Sdyson if (tbp->b_bcount != size) 41434611Sdyson break; 41534611Sdyson } 41634611Sdyson 4175455Sdg tbp = getblk(vp, lbn + i, size, 0, 0); 41810541Sdyson 41971230Sdillon /* 42085272Sdillon * Stop scanning if the buffer is fully valid 42185272Sdillon * (marked B_CACHE), or locked (may be doing a 42285272Sdillon * background write), or if the buffer is not 42385272Sdillon * VMIO backed. The clustering code can only deal 42485272Sdillon * with VMIO-backed buffers. 42571230Sdillon */ 42671230Sdillon if ((tbp->b_flags & (B_CACHE|B_LOCKED)) || 42710541Sdyson (tbp->b_flags & B_VMIO) == 0) { 42813490Sdyson bqrelse(tbp); 4295455Sdg break; 4305455Sdg } 43110541Sdyson 43285272Sdillon /* 43385272Sdillon * The buffer must be completely invalid in order to 43485272Sdillon * take part in the cluster. If it is partially valid 43585272Sdillon * then we stop. 43685272Sdillon */ 43771230Sdillon for (j = 0;j < tbp->b_npages; j++) { 43834611Sdyson if (tbp->b_pages[j]->valid) 43910541Sdyson break; 44071230Sdillon } 44110541Sdyson if (j != tbp->b_npages) { 44234611Sdyson bqrelse(tbp); 44310541Sdyson break; 44410541Sdyson } 44510541Sdyson 44685272Sdillon /* 44785272Sdillon * Set a read-ahead mark as appropriate 44885272Sdillon */ 44921002Sdyson if ((fbp && (i == 1)) || (i == (run - 1))) 45021002Sdyson tbp->b_flags |= B_RAM; 45185272Sdillon 45285272Sdillon /* 45385272Sdillon * Set the buffer up for an async read (XXX should 45485272Sdillon * we do this only if we do not wind up brelse()ing?). 
45585272Sdillon * Set the block number if it isn't set, otherwise 45685272Sdillon * if it is make sure it matches the block number we 45785272Sdillon * expect. 45885272Sdillon */ 45958345Sphk tbp->b_flags |= B_ASYNC; 46058345Sphk tbp->b_iocmd = BIO_READ; 46112767Sdyson if (tbp->b_blkno == tbp->b_lblkno) { 46210541Sdyson tbp->b_blkno = bn; 46310541Sdyson } else if (tbp->b_blkno != bn) { 46410541Sdyson brelse(tbp); 46510541Sdyson break; 46610541Sdyson } 4671541Srgrimes } 46848333Speter /* 46948333Speter * XXX fbp from caller may not be B_ASYNC, but we are going 47048333Speter * to biodone() it in cluster_callback() anyway 47148333Speter */ 47248333Speter BUF_KERNPROC(tbp); 47312404Sdyson TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, 47412404Sdyson tbp, b_cluster.cluster_entry); 4755455Sdg for (j = 0; j < tbp->b_npages; j += 1) { 47610541Sdyson vm_page_t m; 47710541Sdyson m = tbp->b_pages[j]; 47838799Sdfr vm_page_io_start(m); 47938517Sdfr vm_object_pip_add(m->object, 1); 48010541Sdyson if ((bp->b_npages == 0) || 48112413Sdyson (bp->b_pages[bp->b_npages-1] != m)) { 48210541Sdyson bp->b_pages[bp->b_npages] = m; 48310541Sdyson bp->b_npages++; 48410541Sdyson } 48518737Sdyson if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) 48618737Sdyson tbp->b_pages[j] = bogus_page; 4871541Srgrimes } 48885511Sdillon /* 48985511Sdillon * XXX shouldn't this be += size for both, like in 49085511Sdillon * cluster_wbuild()? 49185511Sdillon * 49285511Sdillon * Don't inherit tbp->b_bufsize as it may be larger due to 49385511Sdillon * a non-page-aligned size. Instead just aggregate using 49485511Sdillon * 'size'. 
49585511Sdillon */ 49685511Sdillon if (tbp->b_bcount != size) 49785511Sdillon printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size); 49885511Sdillon if (tbp->b_bufsize != size) 49985511Sdillon printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size); 50085511Sdillon bp->b_bcount += size; 50185511Sdillon bp->b_bufsize += size; 5021541Srgrimes } 50318737Sdyson 50485272Sdillon /* 50585272Sdillon * Fully valid pages in the cluster are already good and do not need 50685272Sdillon * to be re-read from disk. Replace the page with bogus_page 50785272Sdillon */ 50885272Sdillon for (j = 0; j < bp->b_npages; j++) { 50918737Sdyson if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) == 51085272Sdillon VM_PAGE_BITS_ALL) { 51118737Sdyson bp->b_pages[j] = bogus_page; 51285272Sdillon } 51318737Sdyson } 51420054Sdyson if (bp->b_bufsize > bp->b_kvasize) 51537559Sbde panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", 51637559Sbde bp->b_bufsize, bp->b_kvasize); 51720054Sdyson bp->b_kvasize = bp->b_bufsize; 51818737Sdyson 51910541Sdyson pmap_qenter(trunc_page((vm_offset_t) bp->b_data), 52010541Sdyson (vm_page_t *)bp->b_pages, bp->b_npages); 5215455Sdg return (bp); 5221541Srgrimes} 5231541Srgrimes 5241541Srgrimes/* 5251541Srgrimes * Cleanup after a clustered read or write. 5261541Srgrimes * This is complicated by the fact that any of the buffers might have 5271541Srgrimes * extra memory (if there were no empty buffer headers at allocbuf time) 5281541Srgrimes * that we will need to shift around. 5291541Srgrimes */ 5301541Srgrimesvoid 5311541Srgrimescluster_callback(bp) 5321541Srgrimes struct buf *bp; 5331541Srgrimes{ 53412404Sdyson struct buf *nbp, *tbp; 5351541Srgrimes int error = 0; 5361541Srgrimes 53779224Sdillon GIANT_REQUIRED; 53879224Sdillon 5391541Srgrimes /* 5401541Srgrimes * Must propogate errors to all the components. 
5411541Srgrimes */ 54258934Sphk if (bp->b_ioflags & BIO_ERROR) 5431541Srgrimes error = bp->b_error; 5441541Srgrimes 54510541Sdyson pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); 5461541Srgrimes /* 5471541Srgrimes * Move memory from the large cluster buffer into the component 5481541Srgrimes * buffers and mark IO as done on these. 5491541Srgrimes */ 55021002Sdyson for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); 55112404Sdyson tbp; tbp = nbp) { 55221002Sdyson nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); 5531541Srgrimes if (error) { 55458934Sphk tbp->b_ioflags |= BIO_ERROR; 5551541Srgrimes tbp->b_error = error; 55646349Salc } else { 55746349Salc tbp->b_dirtyoff = tbp->b_dirtyend = 0; 55858934Sphk tbp->b_flags &= ~B_INVAL; 55958934Sphk tbp->b_ioflags &= ~BIO_ERROR; 56077115Sdillon /* 56177115Sdillon * XXX the bdwrite()/bqrelse() issued during 56277115Sdillon * cluster building clears B_RELBUF (see bqrelse() 56377115Sdillon * comment). If direct I/O was specified, we have 56477115Sdillon * to restore it here to allow the buffer and VM 56577115Sdillon * to be freed. 56677115Sdillon */ 56777115Sdillon if (tbp->b_flags & B_DIRECT) 56877115Sdillon tbp->b_flags |= B_RELBUF; 56946349Salc } 57059249Sphk bufdone(tbp); 5711541Srgrimes } 57242957Sdillon relpbuf(bp, &cluster_pbuf_freecnt); 5731541Srgrimes} 5741541Srgrimes 5751541Srgrimes/* 57648545Smckusick * cluster_wbuild_wb: 57748545Smckusick * 57848545Smckusick * Implement modified write build for cluster. 
57948545Smckusick * 58048545Smckusick * write_behind = 0 write behind disabled 58148545Smckusick * write_behind = 1 write behind normal (default) 58248545Smckusick * write_behind = 2 write behind backed-off 58348545Smckusick */ 58448545Smckusick 58548545Smckusickstatic __inline int 58648545Smckusickcluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len) 58748545Smckusick{ 58848545Smckusick int r = 0; 58948545Smckusick 59048545Smckusick switch(write_behind) { 59148545Smckusick case 2: 59248545Smckusick if (start_lbn < len) 59348545Smckusick break; 59448545Smckusick start_lbn -= len; 59548545Smckusick /* fall through */ 59648545Smckusick case 1: 59748545Smckusick r = cluster_wbuild(vp, size, start_lbn, len); 59848545Smckusick /* fall through */ 59948545Smckusick default: 60048545Smckusick /* fall through */ 60148545Smckusick break; 60248545Smckusick } 60348545Smckusick return(r); 60448545Smckusick} 60548545Smckusick 60648545Smckusick/* 6071541Srgrimes * Do clustered write for FFS. 6081541Srgrimes * 6091541Srgrimes * Three cases: 6101541Srgrimes * 1. Write is not sequential (write asynchronously) 6111541Srgrimes * Write is sequential: 6121541Srgrimes * 2. beginning of cluster - begin cluster 6131541Srgrimes * 3. middle of a cluster - add to cluster 6141541Srgrimes * 4. 
end of a cluster - asynchronously write cluster 6151541Srgrimes */ 6161541Srgrimesvoid 61758909Sdilloncluster_write(bp, filesize, seqcount) 6185455Sdg struct buf *bp; 6191541Srgrimes u_quad_t filesize; 62058909Sdillon int seqcount; 6211541Srgrimes{ 6225455Sdg struct vnode *vp; 6235455Sdg daddr_t lbn; 6245455Sdg int maxclen, cursize; 6255455Sdg int lblocksize; 62612404Sdyson int async; 6271541Srgrimes 6285455Sdg vp = bp->b_vp; 62932286Sdyson if (vp->v_type == VREG) { 63032286Sdyson async = vp->v_mount->mnt_flag & MNT_ASYNC; 63132286Sdyson lblocksize = vp->v_mount->mnt_stat.f_iosize; 63232286Sdyson } else { 63332286Sdyson async = 0; 63432286Sdyson lblocksize = bp->b_bufsize; 63532286Sdyson } 6365455Sdg lbn = bp->b_lblkno; 63742408Seivind KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")); 63834694Sdyson 6391541Srgrimes /* Initialize vnode to beginning of file. */ 6401541Srgrimes if (lbn == 0) 6411541Srgrimes vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; 6421541Srgrimes 6435455Sdg if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || 6445455Sdg (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { 64551797Sphk maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1; 6461541Srgrimes if (vp->v_clen != 0) { 6471541Srgrimes /* 6481541Srgrimes * Next block is not sequential. 6498876Srgrimes * 6501541Srgrimes * If we are not writing at end of file, the process 6515455Sdg * seeked to another point in the file since its last 6525455Sdg * write, or we have reached our maximum cluster size, 6535455Sdg * then push the previous cluster. Otherwise try 6545455Sdg * reallocating to make it sequential. 
65558909Sdillon * 65658909Sdillon * Change to algorithm: only push previous cluster if 65758909Sdillon * it was sequential from the point of view of the 65858909Sdillon * seqcount heuristic, otherwise leave the buffer 65958909Sdillon * intact so we can potentially optimize the I/O 66058909Sdillon * later on in the buf_daemon or update daemon 66158909Sdillon * flush. 6621541Srgrimes */ 6631541Srgrimes cursize = vp->v_lastw - vp->v_cstart + 1; 66434611Sdyson if (((u_quad_t) bp->b_offset + lblocksize) != filesize || 66510541Sdyson lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { 66658909Sdillon if (!async && seqcount > 0) { 66748677Smckusick cluster_wbuild_wb(vp, lblocksize, 66812404Sdyson vp->v_cstart, cursize); 66958909Sdillon } 67010541Sdyson } else { 67110541Sdyson struct buf **bpp, **endbp; 67210541Sdyson struct cluster_save *buflist; 67310541Sdyson 67410541Sdyson buflist = cluster_collectbufs(vp, bp); 67510541Sdyson endbp = &buflist->bs_children 67610541Sdyson [buflist->bs_nchildren - 1]; 67710541Sdyson if (VOP_REALLOCBLKS(vp, buflist)) { 67810541Sdyson /* 67958909Sdillon * Failed, push the previous cluster 68058909Sdillon * if *really* writing sequentially 68158909Sdillon * in the logical file (seqcount > 1), 68258909Sdillon * otherwise delay it in the hopes that 68358909Sdillon * the low level disk driver can 68458909Sdillon * optimize the write ordering. 68510541Sdyson */ 68610541Sdyson for (bpp = buflist->bs_children; 68710541Sdyson bpp < endbp; bpp++) 68810541Sdyson brelse(*bpp); 68910541Sdyson free(buflist, M_SEGMENT); 69058909Sdillon if (seqcount > 1) { 69158909Sdillon cluster_wbuild_wb(vp, 69258909Sdillon lblocksize, vp->v_cstart, 69358909Sdillon cursize); 69458909Sdillon } 69510541Sdyson } else { 69610541Sdyson /* 69710541Sdyson * Succeeded, keep building cluster. 
69810541Sdyson */ 69910541Sdyson for (bpp = buflist->bs_children; 70010541Sdyson bpp <= endbp; bpp++) 70110541Sdyson bdwrite(*bpp); 70210541Sdyson free(buflist, M_SEGMENT); 70310541Sdyson vp->v_lastw = lbn; 70410541Sdyson vp->v_lasta = bp->b_blkno; 70510541Sdyson return; 70610541Sdyson } 70710541Sdyson } 7081541Srgrimes } 7091541Srgrimes /* 7105455Sdg * Consider beginning a cluster. If at end of file, make 7115455Sdg * cluster as large as possible, otherwise find size of 7125455Sdg * existing cluster. 7131541Srgrimes */ 71432286Sdyson if ((vp->v_type == VREG) && 71534611Sdyson ((u_quad_t) bp->b_offset + lblocksize) != filesize && 7167613Sdg (bp->b_blkno == bp->b_lblkno) && 71710551Sdyson (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || 71810541Sdyson bp->b_blkno == -1)) { 7191541Srgrimes bawrite(bp); 7201541Srgrimes vp->v_clen = 0; 7211541Srgrimes vp->v_lasta = bp->b_blkno; 7221541Srgrimes vp->v_cstart = lbn + 1; 7231541Srgrimes vp->v_lastw = lbn; 7241541Srgrimes return; 7251541Srgrimes } 7265455Sdg vp->v_clen = maxclen; 72712404Sdyson if (!async && maxclen == 0) { /* I/O not contiguous */ 7281541Srgrimes vp->v_cstart = lbn + 1; 72913490Sdyson bawrite(bp); 7305455Sdg } else { /* Wait for rest of cluster */ 7311541Srgrimes vp->v_cstart = lbn; 7325455Sdg bdwrite(bp); 7331541Srgrimes } 7341541Srgrimes } else if (lbn == vp->v_cstart + vp->v_clen) { 7351541Srgrimes /* 73658909Sdillon * At end of cluster, write it out if seqcount tells us we 73758909Sdillon * are operating sequentially, otherwise let the buf or 73858909Sdillon * update daemon handle it. 
 */
	/*
	 * NOTE(review): the code from here to the closing brace below is the
	 * tail of the preceding write-clustering function (its start is above
	 * this chunk); reproduced unchanged.
	 */
		bdwrite(bp);
		if (seqcount > 1)
			cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else if (vm_page_count_severe()) {
		/*
		 * We are low on memory, get it going NOW
		 */
		bawrite(bp);
	} else {
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	}
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 *
 * Write out up to 'len' logical blocks of 'vp' starting at 'start_lbn',
 * coalescing runs of adjacent delayed-write buffers of size 'size' into a
 * single physical buffer (pbuf) so they go to the driver as one I/O.
 * Buffers that cannot be clustered are pushed individually with bawrite().
 * Returns the total number of buffer bytes queued for write.
 */
int
cluster_wbuild(vp, size, start_lbn, len)
	struct vnode *vp;
	long size;
	daddr_t start_lbn;
	int len;
{
	struct buf *bp, *tbp;
	int i, j, s;
	int totalwritten = 0;
	int dbsize = btodb(size);	/* block size in DEV_BSIZE disk blocks */

	GIANT_REQUIRED;		/* asserts the caller holds Giant */

	while (len > 0) {
		/* splbio/splx bracket manipulation of the buffer queues */
		s = splbio();
		/*
		 * If the buffer is not delayed-write (i.e. dirty), or it
		 * is delayed-write but either locked or inval, it cannot
		 * partake in the clustered write.
		 */
		if (((tbp = gbincore(vp, start_lbn)) == NULL) ||
		  ((tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) != B_DELWRI) ||
		  BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) {
			++start_lbn;
			--len;
			splx(s);
			continue;
		}
		bremfree(tbp);
		tbp->b_flags &= ~B_DONE;
		splx(s);

		/*
		 * Extra memory in the buffer, punt on this buffer.
		 * XXX we could handle this in most cases, but we would
		 * have to push the extra memory down to after our max
		 * possible cluster size and then potentially pull it back
		 * up if the cluster was terminated prematurely--too much
		 * hassle.
		 *
		 * Also fall back to a plain async write when clustering is
		 * disallowed (no B_CLUSTEROK/B_VMIO, B_MALLOC backing),
		 * only one block remains, or no pbuf is available.
		 */
		if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) != 
		  (B_CLUSTEROK | B_VMIO)) ||
		  (tbp->b_bcount != tbp->b_bufsize) ||
		  (tbp->b_bcount != size) ||
		  (len == 1) ||
		  ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			++start_lbn;
			--len;
			continue;
		}

		/*
		 * We got a pbuf to make the cluster in.
		 * so initialise it.
		 */
		TAILQ_INIT(&bp->b_cluster.cluster_head);
		bp->b_bcount = 0;
		bp->b_magic = tbp->b_magic;
		bp->b_op = tbp->b_op;
		bp->b_bufsize = 0;
		bp->b_npages = 0;
		if (tbp->b_wcred != NOCRED)
			bp->b_wcred = crhold(tbp->b_wcred);

		bp->b_blkno = tbp->b_blkno;
		bp->b_lblkno = tbp->b_lblkno;
		bp->b_offset = tbp->b_offset;

		/*
		 * We are synthesizing a buffer out of vm_page_t's, but
		 * if the block size is not page aligned then the starting
		 * address may not be either.  Inherit the b_data offset
		 * from the original buffer.
		 */
		bp->b_data = (char *)((vm_offset_t)bp->b_data |
		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
		bp->b_flags |= B_CLUSTER |
				(tbp->b_flags & (B_VMIO | B_NEEDCOMMIT | B_NOWDRAIN));
		bp->b_iodone = cluster_callback;	/* tear down cluster on completion */
		pbgetvp(vp, bp);
		/*
		 * From this location in the file, scan forward to see
		 * if there are buffers with adjacent data that need to
		 * be written as well.
		 */
		for (i = 0; i < len; ++i, ++start_lbn) {
			if (i != 0) { /* If not the first buffer */
				s = splbio();
				/*
				 * If the adjacent data is not even in core it
				 * can't need to be written.
				 */
				if ((tbp = gbincore(vp, start_lbn)) == NULL) {
					splx(s);
					break;
				}

				/*
				 * If it IS in core, but has different
				 * characteristics, or is locked (which
				 * means it could be undergoing a background
				 * I/O or be in a weird state), then don't
				 * cluster with it.
				 */
				if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
				    B_INVAL | B_DELWRI | B_NEEDCOMMIT))
				  != (B_DELWRI | B_CLUSTEROK |
				    (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
				    (tbp->b_flags & B_LOCKED) ||
				  tbp->b_wcred != bp->b_wcred ||
				  BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) {
					splx(s);
					break;
				}

				/*
				 * Check that the combined cluster
				 * would make sense with regard to pages
				 * and would not be too large
				 * (bounded by the mount's max I/O size).
				 */
				if ((tbp->b_bcount != size) ||
				  ((bp->b_blkno + (dbsize * i)) !=
				    tbp->b_blkno) ||
				  ((tbp->b_npages + bp->b_npages) >
				    (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) {
					BUF_UNLOCK(tbp);
					splx(s);
					break;
				}
				/*
				 * Ok, it's passed all the tests,
				 * so remove it from the free list
				 * and mark it busy. We will use it.
				 */
				bremfree(tbp);
				tbp->b_flags &= ~B_DONE;
				splx(s);
			} /* end of code for non-first buffers only */
			/* check for latent dependencies to be handled */
			if ((LIST_FIRST(&tbp->b_dep)) != NULL)
				buf_start(tbp);
			/*
			 * If the IO is via the VM then we do some
			 * special VM hackery (yuck).  Since the buffer's
			 * block size may not be page-aligned it is possible
			 * for a page to be shared between two buffers.  We
			 * have to get rid of the duplication when building
			 * the cluster.
			 */
			if (tbp->b_flags & B_VMIO) {
				vm_page_t m;

				if (i != 0) { /* if not first buffer */
					/*
					 * A busy page means someone else is
					 * working on it; stop extending the
					 * cluster here.
					 */
					for (j = 0; j < tbp->b_npages; j += 1) {
						m = tbp->b_pages[j];
						if (m->flags & PG_BUSY) {
							bqrelse(tbp);
							goto finishcluster;
						}
					}
				}

				for (j = 0; j < tbp->b_npages; j += 1) {
					m = tbp->b_pages[j];
					vm_page_io_start(m);
					vm_object_pip_add(m->object, 1);
					/*
					 * Skip a page already appended by the
					 * previous (page-sharing) buffer.
					 */
					if ((bp->b_npages == 0) ||
					  (bp->b_pages[bp->b_npages - 1] != m)) {
						bp->b_pages[bp->b_npages] = m;
						bp->b_npages++;
					}
				}
			}
			bp->b_bcount += size;
			bp->b_bufsize += size;

			/*
			 * Mark the component buffer clean and in-flight; the
			 * cluster pbuf now owns its write.
			 */
			s = splbio();
			bundirty(tbp);
			tbp->b_flags &= ~B_DONE;
			tbp->b_ioflags &= ~BIO_ERROR;
			tbp->b_flags |= B_ASYNC;
			tbp->b_iocmd = BIO_WRITE;
			reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
			++tbp->b_vp->v_numoutput;
			splx(s);
			BUF_KERNPROC(tbp);
			/* Chain the component buffer onto the cluster. */
			TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
				tbp, b_cluster.cluster_entry);
		}
	finishcluster:
		/* Map the collected pages into the pbuf's kernel VA. */
		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
			(vm_page_t *) bp->b_pages, bp->b_npages);
		if (bp->b_bufsize > bp->b_kvasize)
			panic(
			    "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
			    bp->b_bufsize, bp->b_kvasize);
		bp->b_kvasize = bp->b_bufsize;
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
		bawrite(bp);	/* issue the combined async write */

		len -= i;	/* skip past the blocks consumed by this cluster */
	}
	return totalwritten;
}

/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 *
 * Builds a cluster_save describing the vnode's pending cluster
 * (v_cstart .. v_lastw) followed by last_bp: each block is read in with
 * bread() and, if still unmapped (b_blkno == b_lblkno), translated to a
 * physical block via VOP_BMAP.  Returns the M_SEGMENT allocation; the
 * caller is responsible for freeing it.
 */
static struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *buflist;
	struct buf *bp;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	/*
	 * One allocation holds both the header and the len + 1 element
	 * children array, which is placed immediately after the header.
	 */
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
		/*
		 * NOTE(review): read size is taken from last_bp->b_bcount —
		 * presumably all blocks in the cluster share that size.
		 */
		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp);
		buflist->bs_children[i] = bp;
		if (bp->b_blkno == bp->b_lblkno)
			VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
				NULL, NULL);
	}
	/* The caller's buffer becomes the final child. */
	buflist->bs_children[i] = bp = last_bp;

	if (bp->b_blkno == bp->b_lblkno)
		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
			NULL, NULL);
	buflist->bs_nchildren = i + 1;
	return (buflist);
}