/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 * $FreeBSD: head/sys/kern/vfs_cluster.c 59249 2000-04-15 05:54:02Z phk $
 */

#include "opt_debug_cluster.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <sys/sysctl.h>

#if defined(CLUSTERDEBUG)
#include <sys/sysctl.h>
static int	rcluster= 0;
SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
#endif

static MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer");

static struct cluster_save *
	cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp));
static struct buf *
	cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn,
	    daddr_t blkno, long size, int run, struct buf *fbp));

static int write_behind = 1;
SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, "");

extern vm_page_t bogus_page;

extern int cluster_pbuf_freecnt;

/*
 * Maximum number of blocks for read-ahead.
 */
#define MAXRA 32

/*
 * This replaces bread.
 */
int
cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	long totread;
	int seqcount;
	struct buf **bpp;
{
	struct buf *bp, *rbp, *reqbp;
	daddr_t blkno, origblkno;
	int error, num_ra;
	int i;
	int maxra, racluster;
	long origtotread;

	error = 0;

	/*
	 * Try to limit the amount of read-ahead by a few
	 * ad-hoc parameters.  This needs work!!!
	 */
	racluster = vp->v_mount->mnt_iosize_max / size;
	maxra = 2 * racluster + (totread / size);
	if (maxra > MAXRA)
		maxra = MAXRA;
	if (maxra > nbuf/8)
		maxra = nbuf/8;

	/*
	 * get the requested block
	 */
	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0);
	origblkno = lblkno;
	origtotread = totread;

	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
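	 *
	 * (Descriptive note, derived from the code below: the B_RAM
	 * read-ahead marks planted on buffers further along the run are what
	 * re-arm this logic; a later read that finds the mark on a cached
	 * buffer falls into the scan below, which decides how much
	 * additional read-ahead, if any, to issue.)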
	 */
	if (bp->b_flags & B_CACHE) {
		if (!seqcount) {
			return 0;
		} else if ((bp->b_flags & B_RAM) == 0) {
			return 0;
		} else {
			int s;
			struct buf *tbp;
			bp->b_flags &= ~B_RAM;
			/*
			 * We do the spl here so that there is no window
			 * between the incore and the b_usecount increment
			 * below.  We opt to keep the spl out of the loop
			 * for efficiency.
			 */
			s = splbio();
			for (i = 1; i < maxra; i++) {

				if (!(tbp = incore(vp, lblkno+i))) {
					break;
				}

				/*
				 * Set another read-ahead mark so we know
				 * to check again.
				 */
				if (((i % racluster) == (racluster - 1)) ||
					(i == (maxra - 1)))
					tbp->b_flags |= B_RAM;
			}
			splx(s);
			if (i >= maxra) {
				return 0;
			}
			lblkno += i;
		}
		reqbp = bp = NULL;
	} else {
		off_t firstread = bp->b_offset;

		KASSERT(bp->b_offset != NOOFFSET,
		    ("cluster_read: no buffer offset"));
		if (firstread + totread > filesize)
			totread = filesize - firstread;
		if (totread > size) {
			int nblks = 0;
			int ncontigafter;
			while (totread > 0) {
				nblks++;
				totread -= size;
			}
			if (nblks == 1)
				goto single_block_read;
			if (nblks > racluster)
				nblks = racluster;

			error = VOP_BMAP(vp, lblkno, NULL,
				&blkno, &ncontigafter, NULL);
			if (error)
				goto single_block_read;
			if (blkno == -1)
				goto single_block_read;
			if (ncontigafter == 0)
				goto single_block_read;
			if (ncontigafter + 1 < nblks)
				nblks = ncontigafter + 1;

			bp = cluster_rbuild(vp, filesize, lblkno,
				blkno, size, nblks, bp);
			lblkno += (bp->b_bufsize / size);
		} else {
single_block_read:
			/*
			 * if it isn't in the cache, then get a chunk from
			 * disk if sequential, otherwise just get the block.
			 */
			bp->b_flags |= B_RAM;
			bp->b_iocmd = BIO_READ;
			lblkno += 1;
		}
	}

	/*
	 * if we have been doing sequential I/O, then do some read-ahead
	 */
	rbp = NULL;
	if (seqcount && (lblkno < (origblkno + seqcount))) {
		/*
		 * we now build the read-ahead buffer if it is desirable.
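		 * It is sized from both the on-disk contiguity reported by
		 * VOP_BMAP() and the caller's sequential-access estimate
		 * (seqcount), and it is issued B_ASYNC, so only the original
		 * request is waited on via biowait() below.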
		 */
		if (((u_quad_t)(lblkno + 1) * size) <= filesize &&
		    !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) &&
		    blkno != -1) {
			int nblksread;
			int ntoread = num_ra + 1;
			nblksread = (origtotread + size - 1) / size;
			if (seqcount < nblksread)
				seqcount = nblksread;
			if (seqcount < ntoread)
				ntoread = seqcount;
			if (num_ra) {
				rbp = cluster_rbuild(vp, filesize, lblkno,
					blkno, size, ntoread, NULL);
			} else {
				rbp = getblk(vp, lblkno, size, 0, 0);
				rbp->b_flags |= B_ASYNC | B_RAM;
				rbp->b_iocmd = BIO_READ;
				rbp->b_blkno = blkno;
			}
		}
	}

	/*
	 * handle the synchronous read
	 */
	if (bp) {
#if defined(CLUSTERDEBUG)
		if (rcluster)
			printf("S(%ld,%ld,%d) ",
			    (long)bp->b_lblkno, bp->b_bcount, seqcount);
#endif
		if ((bp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(bp, 0);
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL)
			BUF_KERNPROC(bp);
		error = VOP_STRATEGY(vp, bp);
		curproc->p_stats->p_ru.ru_inblock++;
	}

	/*
	 * and if we have read-aheads, do them too
	 */
	if (rbp) {
		if (error) {
			rbp->b_flags &= ~B_ASYNC;
			brelse(rbp);
		} else if (rbp->b_flags & B_CACHE) {
			rbp->b_flags &= ~B_ASYNC;
			bqrelse(rbp);
		} else {
#if defined(CLUSTERDEBUG)
			if (rcluster) {
				if (bp)
					printf("A+(%ld,%ld,%ld,%d) ",
					    (long)rbp->b_lblkno, rbp->b_bcount,
					    (long)(rbp->b_lblkno - origblkno),
					    seqcount);
				else
					printf("A(%ld,%ld,%ld,%d) ",
					    (long)rbp->b_lblkno, rbp->b_bcount,
					    (long)(rbp->b_lblkno - origblkno),
					    seqcount);
			}
#endif

			if ((rbp->b_flags & B_CLUSTER) == 0)
				vfs_busy_pages(rbp, 0);
			rbp->b_flags &= ~B_INVAL;
			rbp->b_ioflags &= ~BIO_ERROR;
			if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL)
				BUF_KERNPROC(rbp);
			(void) VOP_STRATEGY(vp, rbp);
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	if (reqbp)
		return (biowait(reqbp));
	else
		return (error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
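 *
 * (Descriptive note: a pbuf obtained from the cluster_pbuf_freecnt pool
 * carries the whole run.  The component buffers' pages are gathered into it
 * and mapped with pmap_qenter(), pages that are already fully valid are
 * replaced with bogus_page so the transfer cannot overwrite them, and
 * cluster_callback() later hands completion status back to each component
 * buffer.)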
 */
static struct buf *
cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	struct buf *fbp;
{
	struct buf *bp, *tbp;
	daddr_t bn;
	int i, inc, j;

	KASSERT(size == vp->v_mount->mnt_stat.f_iosize,
	    ("cluster_rbuild: size %ld != filesize %ld\n",
	    size, vp->v_mount->mnt_stat.f_iosize));

	/*
	 * avoid a division
	 */
	while ((u_quad_t) size * (lbn + run) > filesize) {
		--run;
	}

	if (fbp) {
		tbp = fbp;
		tbp->b_iocmd = BIO_READ;
	} else {
		tbp = getblk(vp, lbn, size, 0, 0);
		if (tbp->b_flags & B_CACHE)
			return tbp;
		tbp->b_flags |= B_ASYNC | B_RAM;
		tbp->b_iocmd = BIO_READ;
	}

	tbp->b_blkno = blkno;
	if( (tbp->b_flags & B_MALLOC) ||
		((tbp->b_flags & B_VMIO) == 0) || (run <= 1) )
		return tbp;

	bp = trypbuf(&cluster_pbuf_freecnt);
	if (bp == 0)
		return tbp;

	bp->b_data = (char *)((vm_offset_t)bp->b_data |
	    ((vm_offset_t)tbp->b_data & PAGE_MASK));
	bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO;
	bp->b_iocmd = BIO_READ;
	bp->b_iodone = cluster_callback;
	bp->b_blkno = blkno;
	bp->b_lblkno = lbn;
	bp->b_offset = tbp->b_offset;
	KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset"));
	pbgetvp(vp, bp);

	TAILQ_INIT(&bp->b_cluster.cluster_head);

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	inc = btodb(size);
	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
		if (i != 0) {
			if ((bp->b_npages * PAGE_SIZE) +
				round_page(size) > vp->v_mount->mnt_iosize_max)
				break;

			if ((tbp = incore(vp, lbn + i)) != NULL) {
				if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT))
					break;
				BUF_UNLOCK(tbp);

				for (j = 0; j < tbp->b_npages; j++)
					if (tbp->b_pages[j]->valid)
						break;

				if (j != tbp->b_npages)
					break;

				if (tbp->b_bcount != size)
					break;
			}

			tbp = getblk(vp, lbn + i, size, 0, 0);

			if ((tbp->b_flags & B_CACHE) ||
				(tbp->b_flags & B_VMIO) == 0) {
				bqrelse(tbp);
				break;
			}

			for (j = 0;j < tbp->b_npages; j++)
				if (tbp->b_pages[j]->valid)
					break;

			if (j != tbp->b_npages) {
				bqrelse(tbp);
				break;
			}

			if ((fbp && (i == 1)) || (i == (run - 1)))
				tbp->b_flags |= B_RAM;
			tbp->b_flags |= B_ASYNC;
			tbp->b_iocmd = BIO_READ;
			if (tbp->b_blkno == tbp->b_lblkno) {
				tbp->b_blkno = bn;
			} else if (tbp->b_blkno != bn) {
				brelse(tbp);
				break;
			}
		}
		/*
		 * XXX fbp from caller may not be B_ASYNC, but we are going
		 * to biodone() it in cluster_callback() anyway
		 */
		BUF_KERNPROC(tbp);
		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
			tbp, b_cluster.cluster_entry);
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;
			m = tbp->b_pages[j];
			vm_page_io_start(m);
			vm_object_pip_add(m->object, 1);
			if ((bp->b_npages == 0) ||
				(bp->b_pages[bp->b_npages-1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
				tbp->b_pages[j] = bogus_page;
		}
		bp->b_bcount += tbp->b_bcount;
		bp->b_bufsize += tbp->b_bufsize;
	}

	for(j=0;j<bp->b_npages;j++) {
		if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) ==
		    VM_PAGE_BITS_ALL)
			bp->b_pages[j] = bogus_page;
	}
	if (bp->b_bufsize > bp->b_kvasize)
		panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
		    bp->b_bufsize, bp->b_kvasize);
	bp->b_kvasize = bp->b_bufsize;

	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
		(vm_page_t *)bp->b_pages, bp->b_npages);
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct buf *nbp, *tbp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_ioflags & BIO_ERROR)
		error = bp->b_error;

	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
	     tbp; tbp = nbp) {
		nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
		if (error) {
			tbp->b_ioflags |= BIO_ERROR;
			tbp->b_error = error;
		} else {
			tbp->b_dirtyoff = tbp->b_dirtyend = 0;
			tbp->b_flags &= ~B_INVAL;
			tbp->b_ioflags &= ~BIO_ERROR;
		}
		bufdone(tbp);
	}
	relpbuf(bp, &cluster_pbuf_freecnt);
}

/*
 * cluster_wbuild_wb:
 *
 *	Implement modified write build for cluster.
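 *	The policy is selected at run time through the vfs.write_behind
 *	sysctl declared above: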
 *
 *	write_behind = 0	write behind disabled
 *	write_behind = 1	write behind normal (default)
 *	write_behind = 2	write behind backed-off
 */

static __inline int
cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len)
{
	int r = 0;

	switch(write_behind) {
	case 2:
		if (start_lbn < len)
			break;
		start_lbn -= len;
		/* fall through */
	case 1:
		r = cluster_wbuild(vp, size, start_lbn, len);
		/* fall through */
	default:
		/* fall through */
		break;
	}
	return(r);
}

/*
 * Do clustered write for FFS.
 *
 * Three cases:
 *	1.	Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize, seqcount)
	struct buf *bp;
	u_quad_t filesize;
	int seqcount;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;
	int lblocksize;
	int async;

	vp = bp->b_vp;
	if (vp->v_type == VREG) {
		async = vp->v_mount->mnt_flag & MNT_ASYNC;
		lblocksize = vp->v_mount->mnt_stat.f_iosize;
	} else {
		async = 0;
		lblocksize = bp->b_bufsize;
	}
	lbn = bp->b_lblkno;
	KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset"));

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
		maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster. Otherwise try
			 * reallocating to make it sequential.
			 *
			 * Change to algorithm: only push previous cluster if
			 * it was sequential from the point of view of the
			 * seqcount heuristic, otherwise leave the buffer
			 * intact so we can potentially optimize the I/O
			 * later on in the buf_daemon or update daemon
			 * flush.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
			if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				if (!async && seqcount > 0) {
					cluster_wbuild_wb(vp, lblocksize,
						vp->v_cstart, cursize);
				}
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster
					 * if *really* writing sequentially
					 * in the logical file (seqcount > 1),
					 * otherwise delay it in the hopes that
					 * the low level disk driver can
					 * optimize the write ordering.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					if (seqcount > 1) {
						cluster_wbuild_wb(vp,
						    lblocksize, vp->v_cstart,
						    cursize);
					}
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster. If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if ((vp->v_type == VREG) &&
		    ((u_quad_t) bp->b_offset + lblocksize) != filesize &&
		    (bp->b_blkno == bp->b_lblkno) &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
		     bp->b_blkno == -1)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (!async && maxclen == 0) {	/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {	/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out if seqcount tells us we
		 * are operating sequentially, otherwise let the buf or
		 * update daemon handle it.
		 */
		bdwrite(bp);
		if (seqcount > 1)
			cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else {
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	}
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 */
int
cluster_wbuild(vp, size, start_lbn, len)
	struct vnode *vp;
	long size;
	daddr_t start_lbn;
	int len;
{
	struct buf *bp, *tbp;
	int i, j, s;
	int totalwritten = 0;
	int dbsize = btodb(size);

	while (len > 0) {
		s = splbio();
		if (((tbp = gbincore(vp, start_lbn)) == NULL) ||
		    ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) ||
		    BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) {
			++start_lbn;
			--len;
			splx(s);
			continue;
		}
		bremfree(tbp);
		tbp->b_flags &= ~B_DONE;
		splx(s);

		/*
		 * Extra memory in the buffer, punt on this buffer.
		 * XXX we could handle this in most cases, but we would
		 * have to push the extra memory down to after our max
		 * possible cluster size and then potentially pull it back
		 * up if the cluster was terminated prematurely--too much
		 * hassle.
		 */
		if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
		    (tbp->b_bcount != tbp->b_bufsize) ||
		    (tbp->b_bcount != size) ||
		    (len == 1) ||
		    ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			++start_lbn;
			--len;
			continue;
		}

		/*
		 * We got a pbuf to make the cluster in.
		 * so initialise it.
		 */
		TAILQ_INIT(&bp->b_cluster.cluster_head);
		bp->b_bcount = 0;
		bp->b_bufsize = 0;
		bp->b_npages = 0;
		if (tbp->b_wcred != NOCRED) {
			bp->b_wcred = tbp->b_wcred;
			crhold(bp->b_wcred);
		}

		bp->b_blkno = tbp->b_blkno;
		bp->b_lblkno = tbp->b_lblkno;
		bp->b_offset = tbp->b_offset;
		bp->b_data = (char *)((vm_offset_t)bp->b_data |
		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
		bp->b_flags |= B_CLUSTER |
		    (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
		bp->b_iodone = cluster_callback;
		pbgetvp(vp, bp);
		/*
		 * From this location in the file, scan forward to see
		 * if there are buffers with adjacent data that need to
		 * be written as well.
		 */
		for (i = 0; i < len; ++i, ++start_lbn) {
			if (i != 0) { /* If not the first buffer */
				s = splbio();
				/*
				 * If the adjacent data is not even in core it
				 * can't need to be written.
				 */
				if ((tbp = gbincore(vp, start_lbn)) == NULL) {
					splx(s);
					break;
				}

				/*
				 * If it IS in core, but has different
				 * characteristics, don't cluster with it.
				 */
				if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
				    B_INVAL | B_DELWRI | B_NEEDCOMMIT))
				    != (B_DELWRI | B_CLUSTEROK |
				    (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
				    tbp->b_wcred != bp->b_wcred ||
				    BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) {
					splx(s);
					break;
				}

				/*
				 * Check that the combined cluster
				 * would make sense with regard to pages
				 * and would not be too large
				 */
				if ((tbp->b_bcount != size) ||
				    ((bp->b_blkno + (dbsize * i)) !=
				    tbp->b_blkno) ||
				    ((tbp->b_npages + bp->b_npages) >
				    (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) {
					BUF_UNLOCK(tbp);
					splx(s);
					break;
				}
				/*
				 * Ok, it's passed all the tests,
				 * so remove it from the free list
				 * and mark it busy. We will use it.
				 */
				bremfree(tbp);
				tbp->b_flags &= ~B_DONE;
				splx(s);
			} /* end of code for non-first buffers only */
			/* check for latent dependencies to be handled */
			if ((LIST_FIRST(&tbp->b_dep)) != NULL &&
			    bioops.io_start)
				(*bioops.io_start)(tbp);
			/*
			 * If the IO is via the VM then we do some
			 * special VM hackery. (yuck)
			 */
			if (tbp->b_flags & B_VMIO) {
				vm_page_t m;

				if (i != 0) { /* if not first buffer */
					for (j = 0; j < tbp->b_npages; j += 1) {
						m = tbp->b_pages[j];
						if (m->flags & PG_BUSY) {
							bqrelse(tbp);
							goto finishcluster;
						}
					}
				}

				for (j = 0; j < tbp->b_npages; j += 1) {
					m = tbp->b_pages[j];
					vm_page_io_start(m);
					vm_object_pip_add(m->object, 1);
					if ((bp->b_npages == 0) ||
					    (bp->b_pages[bp->b_npages - 1] != m)) {
						bp->b_pages[bp->b_npages] = m;
						bp->b_npages++;
					}
				}
			}
			bp->b_bcount += size;
			bp->b_bufsize += size;

			s = splbio();
			bundirty(tbp);
			tbp->b_flags &= ~B_DONE;
			tbp->b_ioflags &= ~BIO_ERROR;
			tbp->b_flags |= B_ASYNC;
			tbp->b_iocmd = BIO_WRITE;
			reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
			++tbp->b_vp->v_numoutput;
			splx(s);
			BUF_KERNPROC(tbp);
			TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
				tbp, b_cluster.cluster_entry);
		}
	finishcluster:
		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
			(vm_page_t *) bp->b_pages, bp->b_npages);
		if (bp->b_bufsize > bp->b_kvasize)
			panic(
			    "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
			    bp->b_bufsize, bp->b_kvasize);
		bp->b_kvasize = bp->b_bufsize;
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
		bawrite(bp);

		len -= i;
	}
	return totalwritten;
}

/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
static struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *buflist;
	struct buf *bp;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp);
		buflist->bs_children[i] = bp;
		if (bp->b_blkno == bp->b_lblkno)
			VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
			    NULL, NULL);
	}
	buflist->bs_children[i] = bp = last_bp;
	if (bp->b_blkno == bp->b_lblkno)
		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
		    NULL, NULL);
	buflist->bs_nchildren = i + 1;
	return (buflist);
}
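
/*
 * Usage sketch (illustrative only, not part of this file's interface
 * definition): a filesystem's read path typically calls cluster_read() in
 * place of bread() when access looks sequential, and its write path hands
 * each completed logical block to cluster_write().  The caller-side names
 * below (ip->i_size, uio->uio_resid, seqcount) are stand-ins for whatever
 * the filesystem uses to track file size, remaining transfer length and its
 * sequential-access heuristic:
 *
 *	error = cluster_read(vp, ip->i_size, lbn, size, NOCRED,
 *	    uio->uio_resid, seqcount, &bp);
 *	...
 *	cluster_write(bp, ip->i_size, seqcount);
 */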