vfs_cluster.c revision 145734
11541Srgrimes/*- 21541Srgrimes * Copyright (c) 1993 31541Srgrimes * The Regents of the University of California. All rights reserved. 45455Sdg * Modifications/enhancements: 55455Sdg * Copyright (c) 1995 John S. Dyson. All rights reserved. 61541Srgrimes * 71541Srgrimes * Redistribution and use in source and binary forms, with or without 81541Srgrimes * modification, are permitted provided that the following conditions 91541Srgrimes * are met: 101541Srgrimes * 1. Redistributions of source code must retain the above copyright 111541Srgrimes * notice, this list of conditions and the following disclaimer. 121541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 131541Srgrimes * notice, this list of conditions and the following disclaimer in the 141541Srgrimes * documentation and/or other materials provided with the distribution. 151541Srgrimes * 4. Neither the name of the University nor the names of its contributors 161541Srgrimes * may be used to endorse or promote products derived from this software 171541Srgrimes * without specific prior written permission. 181541Srgrimes * 191541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 201541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 211541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 221541Srgrimes * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 231541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 241541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 251541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 261541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 271541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 281541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 291541Srgrimes * SUCH DAMAGE. 301541Srgrimes * 311541Srgrimes * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 321541Srgrimes */ 331541Srgrimes 34116182Sobrien#include <sys/cdefs.h> 35116182Sobrien__FBSDID("$FreeBSD: head/sys/kern/vfs_cluster.c 145734 2005-05-01 01:01:17Z jeff $"); 36116182Sobrien 3732929Seivind#include "opt_debug_cluster.h" 3832929Seivind 391541Srgrimes#include <sys/param.h> 401549Srgrimes#include <sys/systm.h> 4141168Sbde#include <sys/kernel.h> 421541Srgrimes#include <sys/proc.h> 4360041Sphk#include <sys/bio.h> 441541Srgrimes#include <sys/buf.h> 451541Srgrimes#include <sys/vnode.h> 4641124Sdg#include <sys/malloc.h> 471541Srgrimes#include <sys/mount.h> 481541Srgrimes#include <sys/resourcevar.h> 4968885Sdillon#include <sys/vmmeter.h> 506621Sdg#include <vm/vm.h> 5110541Sdyson#include <vm/vm_object.h> 5210541Sdyson#include <vm/vm_page.h> 5348545Smckusick#include <sys/sysctl.h> 541541Srgrimes 5521002Sdyson#if defined(CLUSTERDEBUG) 5621002Sdysonstatic int rcluster= 0; 5791690SeivindSYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, 5891690Seivind "Debug VFS clustering code"); 5921002Sdyson#endif 6021002Sdyson 6141124Sdgstatic MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer"); 6241124Sdg 6312973Sbdestatic struct cluster_save * 6492723Salfred cluster_collectbufs(struct vnode *vp, struct buf *last_bp); 6512973Sbdestatic struct buf * 
6692723Salfred cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, 6796572Sphk daddr_t blkno, long size, int run, struct buf *fbp); 68141628Sphkstatic void cluster_callback(struct buf *); 691541Srgrimes 7048545Smckusickstatic int write_behind = 1; 7191690SeivindSYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, 7291690Seivind "Cluster write-behind; 0: disable, 1: enable, 2: backed off"); 7348545Smckusick 74112175Sjeffstatic int read_max = 8; 75112080SjeffSYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0, 76112080Sjeff "Cluster read-ahead max block count"); 77112080Sjeff 7891690Seivind/* Page expended to mark partially backed buffers */ 7912973Sbdeextern vm_page_t bogus_page; 805455Sdg 8191690Seivind/* 8291690Seivind * Number of physical bufs (pbufs) this subsystem is allowed. 8391690Seivind * Manipulated by vm_pager.c 8491690Seivind */ 8542957Sdillonextern int cluster_pbuf_freecnt; 8642957Sdillon 871541Srgrimes/* 8891690Seivind * Read data to a buf, including read-ahead if we find this to be beneficial. 8991690Seivind * cluster_read replaces bread. 9010541Sdyson */ 911549Srgrimesint 9221002Sdysoncluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) 931541Srgrimes struct vnode *vp; 941541Srgrimes u_quad_t filesize; 951541Srgrimes daddr_t lblkno; 961541Srgrimes long size; 971541Srgrimes struct ucred *cred; 9821002Sdyson long totread; 9921002Sdyson int seqcount; 1001541Srgrimes struct buf **bpp; 1011541Srgrimes{ 10221002Sdyson struct buf *bp, *rbp, *reqbp; 10396572Sphk daddr_t blkno, origblkno; 104112080Sjeff int maxra, racluster; 105112080Sjeff int error, ncontig; 10610541Sdyson int i; 1071541Srgrimes 1081541Srgrimes error = 0; 10921002Sdyson 1105455Sdg /* 11121002Sdyson * Try to limit the amount of read-ahead by a few 11221002Sdyson * ad-hoc parameters. This needs work!!! 
11321002Sdyson */ 11451797Sphk racluster = vp->v_mount->mnt_iosize_max / size; 115112080Sjeff maxra = seqcount; 116112080Sjeff maxra = min(read_max, maxra); 117112080Sjeff maxra = min(nbuf/8, maxra); 118112080Sjeff if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize) 119112080Sjeff maxra = (filesize / size) - lblkno; 12021002Sdyson 12121002Sdyson /* 1225455Sdg * get the requested block 1235455Sdg */ 124111856Sjeff *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, 0); 12521002Sdyson origblkno = lblkno; 12612767Sdyson 1275455Sdg /* 1285455Sdg * if it is in the cache, then check to see if the reads have been 1295455Sdg * sequential. If they have, then try some read-ahead, otherwise 1305455Sdg * back-off on prospective read-aheads. 1315455Sdg */ 1321541Srgrimes if (bp->b_flags & B_CACHE) { 13321002Sdyson if (!seqcount) { 1345455Sdg return 0; 13521002Sdyson } else if ((bp->b_flags & B_RAM) == 0) { 13621002Sdyson return 0; 13721002Sdyson } else { 13821002Sdyson bp->b_flags &= ~B_RAM; 139103931Sjeff VI_LOCK(vp); 14048225Smckusick for (i = 1; i < maxra; i++) { 14199737Sdillon /* 14299737Sdillon * Stop if the buffer does not exist or it 14399737Sdillon * is invalid (about to go away?) 14499737Sdillon */ 145136767Sphk rbp = gbincore(&vp->v_bufobj, lblkno+i); 146112080Sjeff if (rbp == NULL || (rbp->b_flags & B_INVAL)) 14721002Sdyson break; 14821002Sdyson 14921002Sdyson /* 15048677Smckusick * Set another read-ahead mark so we know 15148677Smckusick * to check again. 15221002Sdyson */ 15321002Sdyson if (((i % racluster) == (racluster - 1)) || 15421002Sdyson (i == (maxra - 1))) 155112080Sjeff rbp->b_flags |= B_RAM; 15621002Sdyson } 157103931Sjeff VI_UNLOCK(vp); 15821002Sdyson if (i >= maxra) { 1595839Sdg return 0; 16010541Sdyson } 16121002Sdyson lblkno += i; 16221002Sdyson } 16321002Sdyson reqbp = bp = NULL; 164111886Sjeff /* 165111886Sjeff * If it isn't in the cache, then get a chunk from 166111886Sjeff * disk if sequential, otherwise just get the block. 
167111886Sjeff */ 16821002Sdyson } else { 16942453Seivind off_t firstread = bp->b_offset; 170111886Sjeff int nblks; 17142453Seivind 17242408Seivind KASSERT(bp->b_offset != NOOFFSET, 17342453Seivind ("cluster_read: no buffer offset")); 174111886Sjeff 175112080Sjeff ncontig = 0; 176111886Sjeff 177111886Sjeff /* 178111886Sjeff * Compute the total number of blocks that we should read 179111886Sjeff * synchronously. 180111886Sjeff */ 18121002Sdyson if (firstread + totread > filesize) 18221002Sdyson totread = filesize - firstread; 183111886Sjeff nblks = howmany(totread, size); 184111886Sjeff if (nblks > racluster) 185111886Sjeff nblks = racluster; 18621002Sdyson 187111886Sjeff /* 188111886Sjeff * Now compute the number of contiguous blocks. 189111886Sjeff */ 190111886Sjeff if (nblks > 1) { 19121002Sdyson error = VOP_BMAP(vp, lblkno, NULL, 192112080Sjeff &blkno, &ncontig, NULL); 193111886Sjeff /* 194111886Sjeff * If this failed to map just do the original block. 195111886Sjeff */ 196111886Sjeff if (error || blkno == -1) 197112080Sjeff ncontig = 0; 198111886Sjeff } 19921002Sdyson 200111886Sjeff /* 201111886Sjeff * If we have contiguous data available do a cluster 202111886Sjeff * otherwise just read the requested block. 203111886Sjeff */ 204112080Sjeff if (ncontig) { 205111886Sjeff /* Account for our first block. */ 206112080Sjeff ncontig = min(ncontig + 1, nblks); 207112080Sjeff if (ncontig < nblks) 208112080Sjeff nblks = ncontig; 20921002Sdyson bp = cluster_rbuild(vp, filesize, lblkno, 21021002Sdyson blkno, size, nblks, bp); 21134694Sdyson lblkno += (bp->b_bufsize / size); 21210541Sdyson } else { 21358345Sphk bp->b_flags |= B_RAM; 21458345Sphk bp->b_iocmd = BIO_READ; 21510541Sdyson lblkno += 1; 2168876Srgrimes } 2171541Srgrimes } 2185455Sdg 2195455Sdg /* 220112080Sjeff * handle the synchronous read so that it is available ASAP. 
2215455Sdg */ 2225455Sdg if (bp) { 22370374Sdillon if ((bp->b_flags & B_CLUSTER) == 0) { 22436275Sdyson vfs_busy_pages(bp, 0); 22570374Sdillon } 22658934Sphk bp->b_flags &= ~B_INVAL; 22758934Sphk bp->b_ioflags &= ~BIO_ERROR; 22858345Sphk if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL) 22948333Speter BUF_KERNPROC(bp); 230121205Sphk bp->b_iooffset = dbtob(bp->b_blkno); 231136927Sphk bstrategy(bp); 23236275Sdyson curproc->p_stats->p_ru.ru_inblock++; 2335455Sdg } 23434611Sdyson 2355455Sdg /* 236112080Sjeff * If we have been doing sequential I/O, then do some read-ahead. 2375455Sdg */ 238112080Sjeff while (lblkno < (origblkno + maxra)) { 239112080Sjeff error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL); 240112080Sjeff if (error) 241112080Sjeff break; 242112080Sjeff 243112080Sjeff if (blkno == -1) 244112080Sjeff break; 245112080Sjeff 246112080Sjeff /* 247112080Sjeff * We could throttle ncontig here by maxra but we might as 248112080Sjeff * well read the data if it is contiguous. We're throttled 249112080Sjeff * by racluster anyway. 
250112080Sjeff */ 251112080Sjeff if (ncontig) { 252112080Sjeff ncontig = min(ncontig + 1, racluster); 253112080Sjeff rbp = cluster_rbuild(vp, filesize, lblkno, blkno, 254112080Sjeff size, ncontig, NULL); 255112080Sjeff lblkno += (rbp->b_bufsize / size); 256112838Sjeff if (rbp->b_flags & B_DELWRI) { 257112838Sjeff bqrelse(rbp); 258112838Sjeff continue; 259112838Sjeff } 260112080Sjeff } else { 261112080Sjeff rbp = getblk(vp, lblkno, size, 0, 0, 0); 262112838Sjeff lblkno += 1; 263112838Sjeff if (rbp->b_flags & B_DELWRI) { 264112838Sjeff bqrelse(rbp); 265112838Sjeff continue; 266112838Sjeff } 267112080Sjeff rbp->b_flags |= B_ASYNC | B_RAM; 268112080Sjeff rbp->b_iocmd = BIO_READ; 269112080Sjeff rbp->b_blkno = blkno; 270112080Sjeff } 271112080Sjeff if (rbp->b_flags & B_CACHE) { 27258345Sphk rbp->b_flags &= ~B_ASYNC; 27313490Sdyson bqrelse(rbp); 274112080Sjeff continue; 2755455Sdg } 276112080Sjeff if ((rbp->b_flags & B_CLUSTER) == 0) { 277112080Sjeff vfs_busy_pages(rbp, 0); 278112080Sjeff } 279112080Sjeff rbp->b_flags &= ~B_INVAL; 280112080Sjeff rbp->b_ioflags &= ~BIO_ERROR; 281112080Sjeff if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL) 282112080Sjeff BUF_KERNPROC(rbp); 283121205Sphk rbp->b_iooffset = dbtob(rbp->b_blkno); 284136927Sphk bstrategy(rbp); 285112080Sjeff curproc->p_stats->p_ru.ru_inblock++; 2865455Sdg } 287112080Sjeff 28821002Sdyson if (reqbp) 28959762Sphk return (bufwait(reqbp)); 29021002Sdyson else 29121002Sdyson return (error); 2921541Srgrimes} 2931541Srgrimes 2941541Srgrimes/* 2951541Srgrimes * If blocks are contiguous on disk, use this to provide clustered 2961541Srgrimes * read ahead. We will read as many blocks as possible sequentially 2971541Srgrimes * and then parcel them up into logical blocks in the buffer hash table. 
2981541Srgrimes */ 29910541Sdysonstatic struct buf * 30021002Sdysoncluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) 3011541Srgrimes struct vnode *vp; 3021541Srgrimes u_quad_t filesize; 3031541Srgrimes daddr_t lbn; 30496572Sphk daddr_t blkno; 3051541Srgrimes long size; 3061541Srgrimes int run; 30721002Sdyson struct buf *fbp; 3081541Srgrimes{ 30910541Sdyson struct buf *bp, *tbp; 3101541Srgrimes daddr_t bn; 31140648Sphk int i, inc, j; 3121541Srgrimes 31342408Seivind KASSERT(size == vp->v_mount->mnt_stat.f_iosize, 314122537Smckusick ("cluster_rbuild: size %ld != filesize %jd\n", 315122537Smckusick size, (intmax_t)vp->v_mount->mnt_stat.f_iosize)); 31642453Seivind 31712767Sdyson /* 31812767Sdyson * avoid a division 31912767Sdyson */ 32012767Sdyson while ((u_quad_t) size * (lbn + run) > filesize) { 3211541Srgrimes --run; 32212767Sdyson } 32310541Sdyson 32421002Sdyson if (fbp) { 32521002Sdyson tbp = fbp; 32658345Sphk tbp->b_iocmd = BIO_READ; 32721002Sdyson } else { 328111856Sjeff tbp = getblk(vp, lbn, size, 0, 0, 0); 32921002Sdyson if (tbp->b_flags & B_CACHE) 33021002Sdyson return tbp; 33158345Sphk tbp->b_flags |= B_ASYNC | B_RAM; 33258345Sphk tbp->b_iocmd = BIO_READ; 33321002Sdyson } 33410541Sdyson 33510541Sdyson tbp->b_blkno = blkno; 33616086Sdyson if( (tbp->b_flags & B_MALLOC) || 33716086Sdyson ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) 33810541Sdyson return tbp; 33910541Sdyson 34042957Sdillon bp = trypbuf(&cluster_pbuf_freecnt); 34110541Sdyson if (bp == 0) 34210541Sdyson return tbp; 34310541Sdyson 34485272Sdillon /* 34585272Sdillon * We are synthesizing a buffer out of vm_page_t's, but 34685272Sdillon * if the block size is not page aligned then the starting 34785272Sdillon * address may not be either. Inherit the b_data offset 34885272Sdillon * from the original buffer. 
34985272Sdillon */ 35037467Sbde bp->b_data = (char *)((vm_offset_t)bp->b_data | 35137467Sbde ((vm_offset_t)tbp->b_data & PAGE_MASK)); 35258345Sphk bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO; 35358345Sphk bp->b_iocmd = BIO_READ; 3545455Sdg bp->b_iodone = cluster_callback; 3555455Sdg bp->b_blkno = blkno; 3565455Sdg bp->b_lblkno = lbn; 35734611Sdyson bp->b_offset = tbp->b_offset; 35842453Seivind KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset")); 3595455Sdg pbgetvp(vp, bp); 3601541Srgrimes 36112404Sdyson TAILQ_INIT(&bp->b_cluster.cluster_head); 3621541Srgrimes 3635455Sdg bp->b_bcount = 0; 3645455Sdg bp->b_bufsize = 0; 3655455Sdg bp->b_npages = 0; 3665455Sdg 3671541Srgrimes inc = btodb(size); 36810541Sdyson for (bn = blkno, i = 0; i < run; ++i, bn += inc) { 3695455Sdg if (i != 0) { 37012767Sdyson if ((bp->b_npages * PAGE_SIZE) + 37185272Sdillon round_page(size) > vp->v_mount->mnt_iosize_max) { 37210541Sdyson break; 37385272Sdillon } 37410978Sdyson 375111886Sjeff tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT); 37612767Sdyson 377111886Sjeff /* Don't wait around for locked bufs. */ 378111886Sjeff if (tbp == NULL) 379111886Sjeff break; 38034611Sdyson 38171230Sdillon /* 38285272Sdillon * Stop scanning if the buffer is fully valid 38385272Sdillon * (marked B_CACHE), or locked (may be doing a 38485272Sdillon * background write), or if the buffer is not 38585272Sdillon * VMIO backed. The clustering code can only deal 38685272Sdillon * with VMIO-backed buffers. 38771230Sdillon */ 388136989Sphk VI_LOCK(vp); 389119521Sjeff if ((tbp->b_vflags & BV_BKGRDINPROG) || 390119521Sjeff (tbp->b_flags & B_CACHE) || 391119521Sjeff (tbp->b_flags & B_VMIO) == 0) { 392136989Sphk VI_UNLOCK(vp); 39313490Sdyson bqrelse(tbp); 3945455Sdg break; 3955455Sdg } 396136989Sphk VI_UNLOCK(vp); 39710541Sdyson 39885272Sdillon /* 39985272Sdillon * The buffer must be completely invalid in order to 40085272Sdillon * take part in the cluster. 
If it is partially valid 40185272Sdillon * then we stop. 40285272Sdillon */ 403137010Sphk VM_OBJECT_LOCK(tbp->b_bufobj->bo_object); 40471230Sdillon for (j = 0;j < tbp->b_npages; j++) { 405121269Salc VM_OBJECT_LOCK_ASSERT(tbp->b_pages[j]->object, 406121269Salc MA_OWNED); 40734611Sdyson if (tbp->b_pages[j]->valid) 40810541Sdyson break; 40971230Sdillon } 410137010Sphk VM_OBJECT_UNLOCK(tbp->b_bufobj->bo_object); 41110541Sdyson if (j != tbp->b_npages) { 41234611Sdyson bqrelse(tbp); 41310541Sdyson break; 41410541Sdyson } 41510541Sdyson 41685272Sdillon /* 41785272Sdillon * Set a read-ahead mark as appropriate 41885272Sdillon */ 41921002Sdyson if ((fbp && (i == 1)) || (i == (run - 1))) 42021002Sdyson tbp->b_flags |= B_RAM; 42185272Sdillon 42285272Sdillon /* 42385272Sdillon * Set the buffer up for an async read (XXX should 42485272Sdillon * we do this only if we do not wind up brelse()ing?). 42585272Sdillon * Set the block number if it isn't set, otherwise 42685272Sdillon * if it is make sure it matches the block number we 42785272Sdillon * expect. 
42885272Sdillon */ 42958345Sphk tbp->b_flags |= B_ASYNC; 43058345Sphk tbp->b_iocmd = BIO_READ; 43112767Sdyson if (tbp->b_blkno == tbp->b_lblkno) { 43210541Sdyson tbp->b_blkno = bn; 43310541Sdyson } else if (tbp->b_blkno != bn) { 43410541Sdyson brelse(tbp); 43510541Sdyson break; 43610541Sdyson } 4371541Srgrimes } 43848333Speter /* 43948333Speter * XXX fbp from caller may not be B_ASYNC, but we are going 44048333Speter * to biodone() it in cluster_callback() anyway 44148333Speter */ 44248333Speter BUF_KERNPROC(tbp); 44312404Sdyson TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, 44412404Sdyson tbp, b_cluster.cluster_entry); 445137010Sphk VM_OBJECT_LOCK(tbp->b_bufobj->bo_object); 4465455Sdg for (j = 0; j < tbp->b_npages; j += 1) { 44710541Sdyson vm_page_t m; 44810541Sdyson m = tbp->b_pages[j]; 44938799Sdfr vm_page_io_start(m); 45038517Sdfr vm_object_pip_add(m->object, 1); 45110541Sdyson if ((bp->b_npages == 0) || 45212413Sdyson (bp->b_pages[bp->b_npages-1] != m)) { 45310541Sdyson bp->b_pages[bp->b_npages] = m; 45410541Sdyson bp->b_npages++; 45510541Sdyson } 45618737Sdyson if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) 45718737Sdyson tbp->b_pages[j] = bogus_page; 4581541Srgrimes } 459137010Sphk VM_OBJECT_UNLOCK(tbp->b_bufobj->bo_object); 46085511Sdillon /* 46185511Sdillon * XXX shouldn't this be += size for both, like in 46285511Sdillon * cluster_wbuild()? 46385511Sdillon * 46485511Sdillon * Don't inherit tbp->b_bufsize as it may be larger due to 46585511Sdillon * a non-page-aligned size. Instead just aggregate using 46685511Sdillon * 'size'. 
46785511Sdillon */ 46885511Sdillon if (tbp->b_bcount != size) 46985511Sdillon printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size); 47085511Sdillon if (tbp->b_bufsize != size) 47185511Sdillon printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size); 47285511Sdillon bp->b_bcount += size; 47385511Sdillon bp->b_bufsize += size; 4741541Srgrimes } 47518737Sdyson 47685272Sdillon /* 47785272Sdillon * Fully valid pages in the cluster are already good and do not need 47885272Sdillon * to be re-read from disk. Replace the page with bogus_page 47985272Sdillon */ 480137010Sphk VM_OBJECT_LOCK(bp->b_bufobj->bo_object); 48185272Sdillon for (j = 0; j < bp->b_npages; j++) { 482121269Salc VM_OBJECT_LOCK_ASSERT(bp->b_pages[j]->object, MA_OWNED); 48318737Sdyson if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) == 48485272Sdillon VM_PAGE_BITS_ALL) { 48518737Sdyson bp->b_pages[j] = bogus_page; 48685272Sdillon } 48718737Sdyson } 488137010Sphk VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); 48920054Sdyson if (bp->b_bufsize > bp->b_kvasize) 49037559Sbde panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", 49137559Sbde bp->b_bufsize, bp->b_kvasize); 49220054Sdyson bp->b_kvasize = bp->b_bufsize; 49318737Sdyson 49410541Sdyson pmap_qenter(trunc_page((vm_offset_t) bp->b_data), 49510541Sdyson (vm_page_t *)bp->b_pages, bp->b_npages); 4965455Sdg return (bp); 4971541Srgrimes} 4981541Srgrimes 4991541Srgrimes/* 5001541Srgrimes * Cleanup after a clustered read or write. 5011541Srgrimes * This is complicated by the fact that any of the buffers might have 5021541Srgrimes * extra memory (if there were no empty buffer headers at allocbuf time) 5031541Srgrimes * that we will need to shift around. 
5041541Srgrimes */ 505141628Sphkstatic void 5061541Srgrimescluster_callback(bp) 5071541Srgrimes struct buf *bp; 5081541Srgrimes{ 50912404Sdyson struct buf *nbp, *tbp; 5101541Srgrimes int error = 0; 5111541Srgrimes 5121541Srgrimes /* 5131541Srgrimes * Must propogate errors to all the components. 5141541Srgrimes */ 51558934Sphk if (bp->b_ioflags & BIO_ERROR) 5161541Srgrimes error = bp->b_error; 5171541Srgrimes 518145700Sjeff VM_LOCK_GIANT(); 51910541Sdyson pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); 520145700Sjeff VM_UNLOCK_GIANT(); 5211541Srgrimes /* 5221541Srgrimes * Move memory from the large cluster buffer into the component 5231541Srgrimes * buffers and mark IO as done on these. 5241541Srgrimes */ 52521002Sdyson for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); 52612404Sdyson tbp; tbp = nbp) { 52721002Sdyson nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); 5281541Srgrimes if (error) { 52958934Sphk tbp->b_ioflags |= BIO_ERROR; 5301541Srgrimes tbp->b_error = error; 53146349Salc } else { 53246349Salc tbp->b_dirtyoff = tbp->b_dirtyend = 0; 53358934Sphk tbp->b_flags &= ~B_INVAL; 53458934Sphk tbp->b_ioflags &= ~BIO_ERROR; 53577115Sdillon /* 53677115Sdillon * XXX the bdwrite()/bqrelse() issued during 53777115Sdillon * cluster building clears B_RELBUF (see bqrelse() 53877115Sdillon * comment). If direct I/O was specified, we have 53977115Sdillon * to restore it here to allow the buffer and VM 54077115Sdillon * to be freed. 54177115Sdillon */ 54277115Sdillon if (tbp->b_flags & B_DIRECT) 54377115Sdillon tbp->b_flags |= B_RELBUF; 54446349Salc } 54559249Sphk bufdone(tbp); 5461541Srgrimes } 547137719Sphk pbrelvp(bp); 54842957Sdillon relpbuf(bp, &cluster_pbuf_freecnt); 5491541Srgrimes} 5501541Srgrimes 5511541Srgrimes/* 55248545Smckusick * cluster_wbuild_wb: 55348545Smckusick * 55448545Smckusick * Implement modified write build for cluster. 
55548545Smckusick * 55648545Smckusick * write_behind = 0 write behind disabled 55748545Smckusick * write_behind = 1 write behind normal (default) 55848545Smckusick * write_behind = 2 write behind backed-off 55948545Smckusick */ 56048545Smckusick 56148545Smckusickstatic __inline int 56248545Smckusickcluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len) 56348545Smckusick{ 56448545Smckusick int r = 0; 56548545Smckusick 56648545Smckusick switch(write_behind) { 56748545Smckusick case 2: 56848545Smckusick if (start_lbn < len) 56948545Smckusick break; 57048545Smckusick start_lbn -= len; 571102412Scharnier /* FALLTHROUGH */ 57248545Smckusick case 1: 57348545Smckusick r = cluster_wbuild(vp, size, start_lbn, len); 574102412Scharnier /* FALLTHROUGH */ 57548545Smckusick default: 576102412Scharnier /* FALLTHROUGH */ 57748545Smckusick break; 57848545Smckusick } 57948545Smckusick return(r); 58048545Smckusick} 58148545Smckusick 58248545Smckusick/* 5831541Srgrimes * Do clustered write for FFS. 5841541Srgrimes * 5851541Srgrimes * Three cases: 5861541Srgrimes * 1. Write is not sequential (write asynchronously) 5871541Srgrimes * Write is sequential: 5881541Srgrimes * 2. beginning of cluster - begin cluster 5891541Srgrimes * 3. middle of a cluster - add to cluster 5901541Srgrimes * 4. 
end of a cluster - asynchronously write cluster 5911541Srgrimes */ 5921541Srgrimesvoid 593135858Sphkcluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount) 5941541Srgrimes{ 5955455Sdg daddr_t lbn; 5965455Sdg int maxclen, cursize; 5975455Sdg int lblocksize; 59812404Sdyson int async; 5991541Srgrimes 60032286Sdyson if (vp->v_type == VREG) { 60132286Sdyson async = vp->v_mount->mnt_flag & MNT_ASYNC; 60232286Sdyson lblocksize = vp->v_mount->mnt_stat.f_iosize; 60332286Sdyson } else { 60432286Sdyson async = 0; 60532286Sdyson lblocksize = bp->b_bufsize; 60632286Sdyson } 6075455Sdg lbn = bp->b_lblkno; 60842408Seivind KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")); 60934694Sdyson 6101541Srgrimes /* Initialize vnode to beginning of file. */ 6111541Srgrimes if (lbn == 0) 6121541Srgrimes vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; 6131541Srgrimes 6145455Sdg if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || 6155455Sdg (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { 61651797Sphk maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1; 6171541Srgrimes if (vp->v_clen != 0) { 6181541Srgrimes /* 6191541Srgrimes * Next block is not sequential. 6208876Srgrimes * 6211541Srgrimes * If we are not writing at end of file, the process 6225455Sdg * seeked to another point in the file since its last 6235455Sdg * write, or we have reached our maximum cluster size, 6245455Sdg * then push the previous cluster. Otherwise try 6255455Sdg * reallocating to make it sequential. 62658909Sdillon * 62758909Sdillon * Change to algorithm: only push previous cluster if 62858909Sdillon * it was sequential from the point of view of the 62958909Sdillon * seqcount heuristic, otherwise leave the buffer 63058909Sdillon * intact so we can potentially optimize the I/O 63158909Sdillon * later on in the buf_daemon or update daemon 63258909Sdillon * flush. 
6331541Srgrimes */ 6341541Srgrimes cursize = vp->v_lastw - vp->v_cstart + 1; 63534611Sdyson if (((u_quad_t) bp->b_offset + lblocksize) != filesize || 63610541Sdyson lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { 63758909Sdillon if (!async && seqcount > 0) { 63848677Smckusick cluster_wbuild_wb(vp, lblocksize, 63912404Sdyson vp->v_cstart, cursize); 64058909Sdillon } 64110541Sdyson } else { 64210541Sdyson struct buf **bpp, **endbp; 64310541Sdyson struct cluster_save *buflist; 64410541Sdyson 64510541Sdyson buflist = cluster_collectbufs(vp, bp); 64610541Sdyson endbp = &buflist->bs_children 64710541Sdyson [buflist->bs_nchildren - 1]; 64810541Sdyson if (VOP_REALLOCBLKS(vp, buflist)) { 64910541Sdyson /* 65058909Sdillon * Failed, push the previous cluster 65158909Sdillon * if *really* writing sequentially 65258909Sdillon * in the logical file (seqcount > 1), 65358909Sdillon * otherwise delay it in the hopes that 65458909Sdillon * the low level disk driver can 65558909Sdillon * optimize the write ordering. 65610541Sdyson */ 65710541Sdyson for (bpp = buflist->bs_children; 65810541Sdyson bpp < endbp; bpp++) 65910541Sdyson brelse(*bpp); 66010541Sdyson free(buflist, M_SEGMENT); 66158909Sdillon if (seqcount > 1) { 66258909Sdillon cluster_wbuild_wb(vp, 66358909Sdillon lblocksize, vp->v_cstart, 66458909Sdillon cursize); 66558909Sdillon } 66610541Sdyson } else { 66710541Sdyson /* 66810541Sdyson * Succeeded, keep building cluster. 66910541Sdyson */ 67010541Sdyson for (bpp = buflist->bs_children; 67110541Sdyson bpp <= endbp; bpp++) 67210541Sdyson bdwrite(*bpp); 67310541Sdyson free(buflist, M_SEGMENT); 67410541Sdyson vp->v_lastw = lbn; 67510541Sdyson vp->v_lasta = bp->b_blkno; 67610541Sdyson return; 67710541Sdyson } 67810541Sdyson } 6791541Srgrimes } 6801541Srgrimes /* 6815455Sdg * Consider beginning a cluster. If at end of file, make 6825455Sdg * cluster as large as possible, otherwise find size of 6835455Sdg * existing cluster. 
6841541Srgrimes */ 68532286Sdyson if ((vp->v_type == VREG) && 68634611Sdyson ((u_quad_t) bp->b_offset + lblocksize) != filesize && 6877613Sdg (bp->b_blkno == bp->b_lblkno) && 68810551Sdyson (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || 68910541Sdyson bp->b_blkno == -1)) { 6901541Srgrimes bawrite(bp); 6911541Srgrimes vp->v_clen = 0; 6921541Srgrimes vp->v_lasta = bp->b_blkno; 6931541Srgrimes vp->v_cstart = lbn + 1; 6941541Srgrimes vp->v_lastw = lbn; 6951541Srgrimes return; 6961541Srgrimes } 6975455Sdg vp->v_clen = maxclen; 69812404Sdyson if (!async && maxclen == 0) { /* I/O not contiguous */ 6991541Srgrimes vp->v_cstart = lbn + 1; 70013490Sdyson bawrite(bp); 7015455Sdg } else { /* Wait for rest of cluster */ 7021541Srgrimes vp->v_cstart = lbn; 7035455Sdg bdwrite(bp); 7041541Srgrimes } 7051541Srgrimes } else if (lbn == vp->v_cstart + vp->v_clen) { 7061541Srgrimes /* 70758909Sdillon * At end of cluster, write it out if seqcount tells us we 70858909Sdillon * are operating sequentially, otherwise let the buf or 70958909Sdillon * update daemon handle it. 7101541Srgrimes */ 71112404Sdyson bdwrite(bp); 71258909Sdillon if (seqcount > 1) 71358909Sdillon cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); 7141541Srgrimes vp->v_clen = 0; 7151541Srgrimes vp->v_cstart = lbn + 1; 71668885Sdillon } else if (vm_page_count_severe()) { 71768885Sdillon /* 71868885Sdillon * We are low on memory, get it going NOW 71968885Sdillon */ 72068885Sdillon bawrite(bp); 72158909Sdillon } else { 7221541Srgrimes /* 7235455Sdg * In the middle of a cluster, so just delay the I/O for now. 7241541Srgrimes */ 7251541Srgrimes bdwrite(bp); 72658909Sdillon } 7271541Srgrimes vp->v_lastw = lbn; 7281541Srgrimes vp->v_lasta = bp->b_blkno; 7291541Srgrimes} 7301541Srgrimes 7311541Srgrimes 7321541Srgrimes/* 7331541Srgrimes * This is an awful lot like cluster_rbuild...wish they could be combined. 
7341541Srgrimes * The last lbn argument is the current block on which I/O is being 7351541Srgrimes * performed. Check to see that it doesn't fall in the middle of 7361541Srgrimes * the current block (if last_bp == NULL). 7371541Srgrimes */ 73812767Sdysonint 73912404Sdysoncluster_wbuild(vp, size, start_lbn, len) 7401541Srgrimes struct vnode *vp; 7411541Srgrimes long size; 7421541Srgrimes daddr_t start_lbn; 7431541Srgrimes int len; 7441541Srgrimes{ 74512404Sdyson struct buf *bp, *tbp; 746145734Sjeff int i, j; 74712767Sdyson int totalwritten = 0; 74812404Sdyson int dbsize = btodb(size); 74935595Sbde 75012767Sdyson while (len > 0) { 75171230Sdillon /* 75271230Sdillon * If the buffer is not delayed-write (i.e. dirty), or it 75371230Sdillon * is delayed-write but either locked or inval, it cannot 75472080Sasmodai * partake in the clustered write. 75571230Sdillon */ 756111886Sjeff VI_LOCK(vp); 757136767Sphk if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL || 758119521Sjeff (tbp->b_vflags & BV_BKGRDINPROG)) { 759111886Sjeff VI_UNLOCK(vp); 76012767Sdyson ++start_lbn; 76112767Sdyson --len; 76212767Sdyson continue; 76312767Sdyson } 764111886Sjeff if (BUF_LOCK(tbp, 765111886Sjeff LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, VI_MTX(vp))) { 766111886Sjeff ++start_lbn; 767111886Sjeff --len; 768111886Sjeff continue; 769111886Sjeff } 770119521Sjeff if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) { 771111886Sjeff BUF_UNLOCK(tbp); 772111886Sjeff ++start_lbn; 773111886Sjeff --len; 774111886Sjeff continue; 775111886Sjeff } 77612767Sdyson bremfree(tbp); 77712767Sdyson tbp->b_flags &= ~B_DONE; 7781541Srgrimes 77947967Sjulian /* 78047967Sjulian * Extra memory in the buffer, punt on this buffer. 
78147967Sjulian * XXX we could handle this in most cases, but we would 78247967Sjulian * have to push the extra memory down to after our max 78347967Sjulian * possible cluster size and then potentially pull it back 78447967Sjulian * up if the cluster was terminated prematurely--too much 78547967Sjulian * hassle. 78647967Sjulian */ 78768868Stegge if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) != 78868868Stegge (B_CLUSTEROK | B_VMIO)) || 78934630Sjulian (tbp->b_bcount != tbp->b_bufsize) || 79034630Sjulian (tbp->b_bcount != size) || 79134630Sjulian (len == 1) || 79247948Sdg ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) { 79312767Sdyson totalwritten += tbp->b_bufsize; 79412767Sdyson bawrite(tbp); 79512767Sdyson ++start_lbn; 79612767Sdyson --len; 79712767Sdyson continue; 79812767Sdyson } 79912404Sdyson 80034630Sjulian /* 80134630Sjulian * We got a pbuf to make the cluster in. 80234630Sjulian * so initialise it. 80334630Sjulian */ 80412767Sdyson TAILQ_INIT(&bp->b_cluster.cluster_head); 80512767Sdyson bp->b_bcount = 0; 80612767Sdyson bp->b_bufsize = 0; 80712767Sdyson bp->b_npages = 0; 80884827Sjhb if (tbp->b_wcred != NOCRED) 80984827Sjhb bp->b_wcred = crhold(tbp->b_wcred); 8101541Srgrimes 81112767Sdyson bp->b_blkno = tbp->b_blkno; 81212767Sdyson bp->b_lblkno = tbp->b_lblkno; 81334611Sdyson bp->b_offset = tbp->b_offset; 81485272Sdillon 81585272Sdillon /* 81685272Sdillon * We are synthesizing a buffer out of vm_page_t's, but 81785272Sdillon * if the block size is not page aligned then the starting 81885272Sdillon * address may not be either. Inherit the b_data offset 81985272Sdillon * from the original buffer. 
82085272Sdillon */ 82137467Sbde bp->b_data = (char *)((vm_offset_t)bp->b_data | 82237467Sbde ((vm_offset_t)tbp->b_data & PAGE_MASK)); /* The cluster buffer takes B_VMIO and B_NEEDCOMMIT from the first buffer; the scan below only accepts later buffers whose two flags match. */ 82358345Sphk bp->b_flags |= B_CLUSTER | 824115456Sphk (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT)); 82512767Sdyson bp->b_iodone = cluster_callback; 82612767Sdyson pbgetvp(vp, bp); 82734630Sjulian /* 82834630Sjulian * From this location in the file, scan forward to see 82934630Sjulian * if there are buffers with adjacent data that need to 83034630Sjulian * be written as well. 83134630Sjulian */ /* On the first pass (i == 0) tbp is still the buffer locked and checked above; only later passes look up and lock a new tbp. i counts the buffers queued on the cluster and start_lbn advances with it. */ 83212767Sdyson for (i = 0; i < len; ++i, ++start_lbn) { 83334630Sjulian if (i != 0) { /* If not the first buffer */ 83434630Sjulian /* 83534630Sjulian * If the adjacent data is not even in core it 83634630Sjulian * can't need to be written. 83734630Sjulian */ 838111886Sjeff VI_LOCK(vp); 839136767Sphk if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL || 840119521Sjeff (tbp->b_vflags & BV_BKGRDINPROG)) { 841111886Sjeff VI_UNLOCK(vp); 84212767Sdyson break; 84312767Sdyson } 8441541Srgrimes 84534630Sjulian /* 84634630Sjulian * If it IS in core, but has different 84771230Sdillon * characteristics, or is locked (which 84871230Sdillon * means it could be undergoing a background 84971230Sdillon * I/O or be in a weird state), then don't 85071230Sdillon * cluster with it. 
85134630Sjulian */ /* The vnode interlock taken by VI_LOCK above is handed to BUF_LOCK through LK_INTERLOCK and VI_MTX(vp); this code never calls VI_UNLOCK after that point on either outcome. NOTE(review): presumably BUF_LOCK drops the interlock itself -- confirm. */ 852111886Sjeff if (BUF_LOCK(tbp, 853111886Sjeff LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, 854145734Sjeff VI_MTX(vp))) 855111886Sjeff break; 856111886Sjeff /* Candidate must be dirty (B_DELWRI), clusterable (B_CLUSTEROK), not B_INVAL, agree with the cluster buffer on B_VMIO and B_NEEDCOMMIT, and carry the same write credential pointer. */ 85748225Smckusick if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK | 85848225Smckusick B_INVAL | B_DELWRI | B_NEEDCOMMIT)) 859111886Sjeff != (B_DELWRI | B_CLUSTEROK | 86048225Smckusick (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) || 861111886Sjeff tbp->b_wcred != bp->b_wcred) { 862112347Sjeff BUF_UNLOCK(tbp); 86312767Sdyson break; 86412767Sdyson } 86512767Sdyson 86634630Sjulian /* 86734630Sjulian * Check that the combined cluster 86834630Sjulian * would make sense with regard to pages 86934630Sjulian * and would not be too large 87034630Sjulian */ /* dbsize is btodb(size), presumably size expressed in disk blocks: the i-th buffer has to start exactly i * dbsize blocks after the first one, and the page total is capped at mnt_iosize_max / PAGE_SIZE. */ 87112767Sdyson if ((tbp->b_bcount != size) || 87234630Sjulian ((bp->b_blkno + (dbsize * i)) != 87334694Sdyson tbp->b_blkno) || 87434630Sjulian ((tbp->b_npages + bp->b_npages) > 87551797Sphk (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) { 87648225Smckusick BUF_UNLOCK(tbp); 87712767Sdyson break; 87812767Sdyson } 87934630Sjulian /* 88034630Sjulian * Ok, it's passed all the tests, 88134630Sjulian * so remove it from the free list 88234630Sjulian * and mark it busy. We will use it. 
88334630Sjulian */ 88412767Sdyson bremfree(tbp); 88512767Sdyson tbp->b_flags &= ~B_DONE; 88634630Sjulian } /* end of code for non-first buffers only */ 88734266Sjulian /* check for latent dependencies to be handled */ 888115365Siedowse if ((LIST_FIRST(&tbp->b_dep)) != NULL) { 889115365Siedowse tbp->b_iocmd = BIO_WRITE; 89061724Sphk buf_start(tbp); 891115365Siedowse } 89234630Sjulian /* 89334630Sjulian * If the IO is via the VM then we do some 89485272Sdillon * special VM hackery (yuck). Since the buffer's 89585272Sdillon * block size may not be page-aligned it is possible 89685272Sdillon * for a page to be shared between two buffers. We 89785272Sdillon * have to get rid of the duplication when building 89885272Sdillon * the cluster. 
89934630Sjulian */ 90013490Sdyson if (tbp->b_flags & B_VMIO) { 90132937Sdyson vm_page_t m; 90232937Sdyson 903137010Sphk VM_OBJECT_LOCK(tbp->b_bufobj->bo_object); 90434630Sjulian if (i != 0) { /* if not first buffer */ 90532937Sdyson for (j = 0; j < tbp->b_npages; j += 1) { 90632937Sdyson m = tbp->b_pages[j]; 90750701Stegge if (m->flags & PG_BUSY) { /* A busy page in a later buffer ends the cluster here: tbp is handed back with bqrelse() and only the i buffers already queued get written. NOTE(review): the unlock names tbp->b_object while the lock above names tbp->b_bufobj->bo_object; presumably the same object -- confirm. */ 908136985Salc VM_OBJECT_UNLOCK( 909136985Salc tbp->b_object); 91050701Stegge bqrelse(tbp); 91132937Sdyson goto finishcluster; 91250701Stegge } 91332937Sdyson } 91432937Sdyson } /* vm_page_io_start() and vm_object_pip_add() run for every page of every buffer; the page is appended to bp->b_pages only when it differs from the page appended last, so a page straddling two adjacent buffers is listed once. */ 91513490Sdyson for (j = 0; j < tbp->b_npages; j += 1) { 91613490Sdyson m = tbp->b_pages[j]; 91738799Sdfr vm_page_io_start(m); 91838517Sdfr vm_object_pip_add(m->object, 1); 91913490Sdyson if ((bp->b_npages == 0) || 92034630Sjulian (bp->b_pages[bp->b_npages - 1] != m)) { 92113490Sdyson bp->b_pages[bp->b_npages] = m; 92213490Sdyson bp->b_npages++; 92313490Sdyson } 92412767Sdyson } 925137010Sphk VM_OBJECT_UNLOCK(tbp->b_bufobj->bo_object); 92612767Sdyson } /* Account for this buffer in the cluster, mark it as an asynchronous write in progress and queue it on the cluster list. 
*/ 92712767Sdyson bp->b_bcount += size; 92812767Sdyson bp->b_bufsize += size; 92944679Sjulian bundirty(tbp); 93058934Sphk tbp->b_flags &= ~B_DONE; 93158934Sphk tbp->b_ioflags &= ~BIO_ERROR; 93212767Sdyson tbp->b_flags |= B_ASYNC; 93358345Sphk tbp->b_iocmd = BIO_WRITE; 934132640Sphk reassignbuf(tbp); /* put on clean list */ 935136767Sphk bufobj_wref(tbp->b_bufobj); 93648333Speter BUF_KERNPROC(tbp); 93712767Sdyson TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, 93812767Sdyson tbp, b_cluster.cluster_entry); 9391541Srgrimes } /* finishcluster is reached both when the scan loop ends (exhausted or break) and from the busy-page bail-out above; bp holds whatever was queued so far. */ 94032937Sdyson finishcluster: 94112767Sdyson pmap_qenter(trunc_page((vm_offset_t) bp->b_data), 94212767Sdyson (vm_page_t *) bp->b_pages, bp->b_npages); 94320054Sdyson if (bp->b_bufsize > bp->b_kvasize) 94437559Sbde panic( 94537559Sbde "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", 94637559Sbde bp->b_bufsize, bp->b_kvasize); 94720054Sdyson bp->b_kvasize = bp->b_bufsize; 94812767Sdyson totalwritten += bp->b_bufsize; 94917304Sdyson bp->b_dirtyoff = 0; 95017304Sdyson bp->b_dirtyend = 
bp->b_bufsize; 95112767Sdyson bawrite(bp); 9521541Srgrimes /* The scan loop advanced start_lbn once per queued buffer, so only len needs adjusting; a block that stopped the scan is looked at again by the outer loop. */ 95312767Sdyson len -= i; 9541541Srgrimes } /* Result: the sum of b_bufsize over every buffer passed to bawrite() above, lone buffers and cluster buffers alike. */ 95512767Sdyson return totalwritten; 9561541Srgrimes} 9571541Srgrimes 9581541Srgrimes/* 9591541Srgrimes * Collect together all the buffers in a cluster. 9601541Srgrimes * Plus add one additional buffer. 9611541Srgrimes */ /* vp: vnode whose blocks v_cstart through v_lastw are gathered. last_bp: buffer stored as the final child; its b_bcount is also the size used to read every other block. Returns one M_SEGMENT allocation holding the cluster_save header followed by len + 1 buffer pointers, with bs_nchildren set to len + 1. NOTE(review): freeing is not done here; presumably the caller releases it -- confirm. */ 96212973Sbdestatic struct cluster_save * 9631541Srgrimescluster_collectbufs(vp, last_bp) 9641541Srgrimes struct vnode *vp; 9651541Srgrimes struct buf *last_bp; 9661541Srgrimes{ 9671541Srgrimes struct cluster_save *buflist; 96841205Smckusick struct buf *bp; 9695455Sdg daddr_t lbn; 9701541Srgrimes int i, len; 9711541Srgrimes 9721541Srgrimes len = vp->v_lastw - vp->v_cstart + 1; /* Header and pointer array share one allocation; bs_children points just past the header. */ 9731541Srgrimes buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), 974111119Simp M_SEGMENT, M_WAITOK); 9751541Srgrimes buflist->bs_nchildren = 0; 9765455Sdg buflist->bs_children = (struct buf **) (buflist + 1); 97741205Smckusick for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { /* The bread() result is explicitly discarded by the (void) cast and bp is used unconditionally afterwards. NOTE(review): this relies on bread() always storing a usable buffer in bp, even on error -- confirm. 
*/ 97841205Smckusick (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp); 97941205Smckusick buflist->bs_children[i] = bp; /* b_blkno equal to b_lblkno is presumably the sign that the block has not been mapped yet, so VOP_BMAP is asked to fill in b_blkno. */ 98041205Smckusick if (bp->b_blkno == bp->b_lblkno) 981136989Sphk VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, 98241205Smckusick NULL, NULL); 98341205Smckusick } /* i equals len here: last_bp takes the extra slot and gets the same mapping treatment. */ 98441529Smckusick buflist->bs_children[i] = bp = last_bp; 98541529Smckusick if (bp->b_blkno == bp->b_lblkno) 986136989Sphk VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); 9871541Srgrimes buflist->bs_nchildren = i + 1; 9881541Srgrimes return (buflist); 9891541Srgrimes} 990