vfs_cluster.c revision 44679
11541Srgrimes/*- 21541Srgrimes * Copyright (c) 1993 31541Srgrimes * The Regents of the University of California. All rights reserved. 45455Sdg * Modifications/enhancements: 55455Sdg * Copyright (c) 1995 John S. Dyson. All rights reserved. 61541Srgrimes * 71541Srgrimes * Redistribution and use in source and binary forms, with or without 81541Srgrimes * modification, are permitted provided that the following conditions 91541Srgrimes * are met: 101541Srgrimes * 1. Redistributions of source code must retain the above copyright 111541Srgrimes * notice, this list of conditions and the following disclaimer. 121541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 131541Srgrimes * notice, this list of conditions and the following disclaimer in the 141541Srgrimes * documentation and/or other materials provided with the distribution. 151541Srgrimes * 3. All advertising materials mentioning features or use of this software 161541Srgrimes * must display the following acknowledgement: 171541Srgrimes * This product includes software developed by the University of 181541Srgrimes * California, Berkeley and its contributors. 191541Srgrimes * 4. Neither the name of the University nor the names of its contributors 201541Srgrimes * may be used to endorse or promote products derived from this software 211541Srgrimes * without specific prior written permission. 221541Srgrimes * 231541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 241541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 251541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 261541Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 271541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 281541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 291541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 301541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 311541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 321541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 331541Srgrimes * SUCH DAMAGE. 341541Srgrimes * 351541Srgrimes * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 3644679Sjulian * $Id: vfs_cluster.c,v 1.79 1999/01/27 21:49:58 dillon Exp $ 371541Srgrimes */ 381541Srgrimes 3932929Seivind#include "opt_debug_cluster.h" 4032929Seivind 411541Srgrimes#include <sys/param.h> 421549Srgrimes#include <sys/systm.h> 4341168Sbde#include <sys/kernel.h> 441541Srgrimes#include <sys/proc.h> 451541Srgrimes#include <sys/buf.h> 461541Srgrimes#include <sys/vnode.h> 4741124Sdg#include <sys/malloc.h> 481541Srgrimes#include <sys/mount.h> 491541Srgrimes#include <sys/resourcevar.h> 506621Sdg#include <vm/vm.h> 5112662Sdg#include <vm/vm_prot.h> 5210541Sdyson#include <vm/vm_object.h> 5310541Sdyson#include <vm/vm_page.h> 541541Srgrimes 5521002Sdyson#if defined(CLUSTERDEBUG) 5621002Sdyson#include <sys/sysctl.h> 5721002Sdysonstatic int rcluster= 0; 5824484SbdeSYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, ""); 5921002Sdyson#endif 6021002Sdyson 6141124Sdgstatic MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer"); 6241124Sdg 6312973Sbdestatic struct cluster_save * 6412973Sbde cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp)); 6512973Sbdestatic struct buf * 6612973Sbde cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn, 6721002Sdyson daddr_t blkno, long size, int run, struct buf *fbp)); 681541Srgrimes 6912973Sbdeextern vm_page_t bogus_page; 705455Sdg 7142957Sdillonextern int cluster_pbuf_freecnt; 7242957Sdillon 731541Srgrimes/* 7421002Sdyson * Maximum number of blocks for read-ahead. 751541Srgrimes */ 7621002Sdyson#define MAXRA 32 775455Sdg 781541Srgrimes/* 7921002Sdyson * This replaces bread. 8010541Sdyson */ 811549Srgrimesint 8221002Sdysoncluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) 831541Srgrimes struct vnode *vp; 841541Srgrimes u_quad_t filesize; 851541Srgrimes daddr_t lblkno; 861541Srgrimes long size; 871541Srgrimes struct ucred *cred; 8821002Sdyson long totread; 8921002Sdyson int seqcount; 901541Srgrimes struct buf **bpp; 911541Srgrimes{ 9221002Sdyson struct buf *bp, *rbp, *reqbp; 9331016Sphk daddr_t blkno, origblkno; 9421002Sdyson int error, num_ra; 9510541Sdyson int i; 9621002Sdyson int maxra, racluster; 9721002Sdyson long origtotread; 981541Srgrimes 991541Srgrimes error = 0; 10032724Sdyson if (vp->v_maxio == 0) 10132724Sdyson vp->v_maxio = DFLTPHYS; 10221002Sdyson 1035455Sdg /* 10421002Sdyson * Try to limit the amount of read-ahead by a few 10521002Sdyson * ad-hoc parameters. This needs work!!! 10621002Sdyson */ 10732724Sdyson racluster = vp->v_maxio/size; 10821002Sdyson maxra = 2 * racluster + (totread / size); 10921002Sdyson if (maxra > MAXRA) 11021002Sdyson maxra = MAXRA; 11121002Sdyson if (maxra > nbuf/8) 11221002Sdyson maxra = nbuf/8; 11321002Sdyson 11421002Sdyson /* 1155455Sdg * get the requested block 1165455Sdg */ 11721002Sdyson *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0); 11821002Sdyson origblkno = lblkno; 11921002Sdyson origtotread = totread; 12012767Sdyson 1215455Sdg /* 1225455Sdg * if it is in the cache, then check to see if the reads have been 1235455Sdg * sequential. If they have, then try some read-ahead, otherwise 1245455Sdg * back-off on prospective read-aheads. 1255455Sdg */ 1261541Srgrimes if (bp->b_flags & B_CACHE) { 12721002Sdyson if (!seqcount) { 1285455Sdg return 0; 12921002Sdyson } else if ((bp->b_flags & B_RAM) == 0) { 13021002Sdyson return 0; 13121002Sdyson } else { 13221002Sdyson int s; 13321002Sdyson struct buf *tbp; 13421002Sdyson bp->b_flags &= ~B_RAM; 13521002Sdyson /* 13621002Sdyson * We do the spl here so that there is no window 13721002Sdyson * between the incore and the b_usecount increment 13821002Sdyson * below. We opt to keep the spl out of the loop 13921002Sdyson * for efficiency. 14021002Sdyson */ 14121002Sdyson s = splbio(); 14221002Sdyson for(i=1;i<maxra;i++) { 14321002Sdyson 14421002Sdyson if (!(tbp = incore(vp, lblkno+i))) { 14521002Sdyson break; 14621002Sdyson } 14721002Sdyson 14821002Sdyson /* 14921002Sdyson * Set another read-ahead mark so we know to check 15021002Sdyson * again. 15121002Sdyson */ 15221002Sdyson if (((i % racluster) == (racluster - 1)) || 15321002Sdyson (i == (maxra - 1))) 15421002Sdyson tbp->b_flags |= B_RAM; 15521002Sdyson 15634694Sdyson if ((tbp->b_usecount < 1) && 15734206Sdyson ((tbp->b_flags & B_BUSY) == 0) && 15834206Sdyson (tbp->b_qindex == QUEUE_LRU)) { 15934206Sdyson TAILQ_REMOVE(&bufqueues[QUEUE_LRU], tbp, b_freelist); 16034206Sdyson TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], tbp, b_freelist); 16121002Sdyson } 16221002Sdyson } 16321002Sdyson splx(s); 16421002Sdyson if (i >= maxra) { 1655839Sdg return 0; 16610541Sdyson } 16721002Sdyson lblkno += i; 16821002Sdyson } 16921002Sdyson reqbp = bp = NULL; 17021002Sdyson } else { 17142453Seivind off_t firstread = bp->b_offset; 17242453Seivind 17342408Seivind KASSERT(bp->b_offset != NOOFFSET, 17442453Seivind ("cluster_read: no buffer offset")); 17521002Sdyson if (firstread + totread > filesize) 17621002Sdyson totread = filesize - firstread; 17721002Sdyson if (totread > size) { 17821002Sdyson int nblks = 0; 17921002Sdyson int ncontigafter; 18021002Sdyson while (totread > 0) { 18121002Sdyson nblks++; 18221002Sdyson totread -= size; 18321002Sdyson } 18421002Sdyson if (nblks == 1) 18521002Sdyson goto single_block_read; 18621002Sdyson if (nblks > racluster) 18721002Sdyson nblks = racluster; 18821002Sdyson 18921002Sdyson error = VOP_BMAP(vp, lblkno, NULL, 19021002Sdyson &blkno, &ncontigafter, NULL); 19121002Sdyson if (error) 19221002Sdyson goto single_block_read; 19321002Sdyson if (blkno == -1) 19421002Sdyson goto single_block_read; 19521002Sdyson if (ncontigafter == 0) 19621002Sdyson goto single_block_read; 19721002Sdyson if (ncontigafter + 1 < nblks) 19821002Sdyson nblks = ncontigafter + 1; 19921002Sdyson 20021002Sdyson bp = cluster_rbuild(vp, filesize, lblkno, 20121002Sdyson blkno, size, nblks, bp); 20234694Sdyson lblkno += (bp->b_bufsize / size); 20310541Sdyson } else { 20421002Sdysonsingle_block_read: 20521002Sdyson /* 20621002Sdyson * if it isn't in the cache, then get a chunk from 20721002Sdyson * disk if sequential, otherwise just get the block. 20821002Sdyson */ 20921002Sdyson bp->b_flags |= B_READ | B_RAM; 21010541Sdyson lblkno += 1; 2118876Srgrimes } 2121541Srgrimes } 2135455Sdg 2145455Sdg /* 2155455Sdg * if we have been doing sequential I/O, then do some read-ahead 2165455Sdg */ 21721002Sdyson rbp = NULL; 21821002Sdyson if (seqcount && (lblkno < (origblkno + seqcount))) { 2191541Srgrimes /* 22021002Sdyson * we now build the read-ahead buffer if it is desirable. 2211541Srgrimes */ 22221002Sdyson if (((u_quad_t)(lblkno + 1) * size) <= filesize && 22321002Sdyson !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) && 22421002Sdyson blkno != -1) { 22521002Sdyson int nblksread; 22621002Sdyson int ntoread = num_ra + 1; 22721002Sdyson nblksread = (origtotread + size - 1) / size; 22821002Sdyson if (seqcount < nblksread) 22921002Sdyson seqcount = nblksread; 23021002Sdyson if (seqcount < ntoread) 23121002Sdyson ntoread = seqcount; 23221002Sdyson if (num_ra) { 23321002Sdyson rbp = cluster_rbuild(vp, filesize, lblkno, 23421002Sdyson blkno, size, ntoread, NULL); 23521002Sdyson } else { 23621002Sdyson rbp = getblk(vp, lblkno, size, 0, 0); 23721002Sdyson rbp->b_flags |= B_READ | B_ASYNC | B_RAM; 23821002Sdyson rbp->b_blkno = blkno; 2395455Sdg } 2401541Srgrimes } 2415455Sdg } 2421541Srgrimes 2435455Sdg /* 24410541Sdyson * handle the synchronous read 2455455Sdg */ 2465455Sdg if (bp) { 24721002Sdyson#if defined(CLUSTERDEBUG) 24836275Sdyson if (rcluster) 24937951Sbde printf("S(%ld,%ld,%d) ", 25037951Sbde (long)bp->b_lblkno, bp->b_bcount, seqcount); 25121002Sdyson#endif 25236275Sdyson if ((bp->b_flags & B_CLUSTER) == 0) 25336275Sdyson vfs_busy_pages(bp, 0); 25437384Sjulian error = VOP_STRATEGY(vp, bp); 25536275Sdyson curproc->p_stats->p_ru.ru_inblock++; 2565455Sdg } 25734611Sdyson 2585455Sdg /* 2595455Sdg * and if we have read-aheads, do them too 2605455Sdg */ 2615455Sdg if (rbp) { 26213490Sdyson if (error) { 2631541Srgrimes rbp->b_flags &= ~(B_ASYNC | B_READ); 2641541Srgrimes brelse(rbp); 26513490Sdyson } else if (rbp->b_flags & B_CACHE) { 26613490Sdyson rbp->b_flags &= ~(B_ASYNC | B_READ); 26713490Sdyson bqrelse(rbp); 2685455Sdg } else { 26921002Sdyson#if defined(CLUSTERDEBUG) 27021002Sdyson if (rcluster) { 27121002Sdyson if (bp) 27237951Sbde printf("A+(%ld,%ld,%ld,%d) ", 27337951Sbde (long)rbp->b_lblkno, rbp->b_bcount, 27437951Sbde (long)(rbp->b_lblkno - origblkno), 27537951Sbde seqcount); 27621002Sdyson else 27737951Sbde printf("A(%ld,%ld,%ld,%d) ", 27837951Sbde (long)rbp->b_lblkno, rbp->b_bcount, 27937951Sbde (long)(rbp->b_lblkno - origblkno), 28037951Sbde seqcount); 28121002Sdyson } 28221002Sdyson#endif 28321002Sdyson 28410541Sdyson if ((rbp->b_flags & B_CLUSTER) == 0) 28510541Sdyson vfs_busy_pages(rbp, 0); 28637384Sjulian (void) VOP_STRATEGY(vp, rbp); 2875455Sdg curproc->p_stats->p_ru.ru_inblock++; 2885455Sdg } 2895455Sdg } 29021002Sdyson if (reqbp) 29121002Sdyson return (biowait(reqbp)); 29221002Sdyson else 29321002Sdyson return (error); 2941541Srgrimes} 2951541Srgrimes 2961541Srgrimes/* 2971541Srgrimes * If blocks are contiguous on disk, use this to provide clustered 2981541Srgrimes * read ahead. We will read as many blocks as possible sequentially 2991541Srgrimes * and then parcel them up into logical blocks in the buffer hash table. 3001541Srgrimes */ 30110541Sdysonstatic struct buf * 30221002Sdysoncluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) 3031541Srgrimes struct vnode *vp; 3041541Srgrimes u_quad_t filesize; 3051541Srgrimes daddr_t lbn; 3061541Srgrimes daddr_t blkno; 3071541Srgrimes long size; 3081541Srgrimes int run; 30921002Sdyson struct buf *fbp; 3101541Srgrimes{ 31110541Sdyson struct buf *bp, *tbp; 3121541Srgrimes daddr_t bn; 31340648Sphk int i, inc, j; 3141541Srgrimes 31542408Seivind KASSERT(size == vp->v_mount->mnt_stat.f_iosize, 31642453Seivind ("cluster_rbuild: size %ld != filesize %ld\n", 31742453Seivind size, vp->v_mount->mnt_stat.f_iosize)); 31842453Seivind 31912767Sdyson /* 32012767Sdyson * avoid a division 32112767Sdyson */ 32212767Sdyson while ((u_quad_t) size * (lbn + run) > filesize) { 3231541Srgrimes --run; 32412767Sdyson } 32510541Sdyson 32621002Sdyson if (fbp) { 32721002Sdyson tbp = fbp; 32821002Sdyson tbp->b_flags |= B_READ; 32921002Sdyson } else { 33021002Sdyson tbp = getblk(vp, lbn, size, 0, 0); 33121002Sdyson if (tbp->b_flags & B_CACHE) 33221002Sdyson return tbp; 33321002Sdyson tbp->b_flags |= B_ASYNC | B_READ | B_RAM; 33421002Sdyson } 33510541Sdyson 33610541Sdyson tbp->b_blkno = blkno; 33716086Sdyson if( (tbp->b_flags & B_MALLOC) || 33816086Sdyson ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) 33910541Sdyson return tbp; 34010541Sdyson 34142957Sdillon bp = trypbuf(&cluster_pbuf_freecnt); 34210541Sdyson if (bp == 0) 34310541Sdyson return tbp; 34410541Sdyson 34537467Sbde bp->b_data = (char *)((vm_offset_t)bp->b_data | 34637467Sbde ((vm_offset_t)tbp->b_data & PAGE_MASK)); 34710541Sdyson bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO; 3485455Sdg bp->b_iodone = cluster_callback; 3495455Sdg bp->b_blkno = blkno; 3505455Sdg bp->b_lblkno = lbn; 35134611Sdyson bp->b_offset = tbp->b_offset; 35242453Seivind KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset")); 3535455Sdg pbgetvp(vp, bp); 3541541Srgrimes 35512404Sdyson TAILQ_INIT(&bp->b_cluster.cluster_head); 3561541Srgrimes 3575455Sdg bp->b_bcount = 0; 3585455Sdg bp->b_bufsize = 0; 3595455Sdg bp->b_npages = 0; 3605455Sdg 36132724Sdyson if (vp->v_maxio == 0) 36232724Sdyson vp->v_maxio = DFLTPHYS; 3631541Srgrimes inc = btodb(size); 36410541Sdyson for (bn = blkno, i = 0; i < run; ++i, bn += inc) { 3655455Sdg if (i != 0) { 36612767Sdyson if ((bp->b_npages * PAGE_SIZE) + 36732724Sdyson round_page(size) > vp->v_maxio) 36810541Sdyson break; 36910978Sdyson 37043301Sdillon if ((tbp = incore(vp, lbn + i)) != NULL) { 37134611Sdyson if (tbp->b_flags & B_BUSY) 37234611Sdyson break; 37312767Sdyson 37434611Sdyson for (j = 0; j < tbp->b_npages; j++) 37534611Sdyson if (tbp->b_pages[j]->valid) 37634611Sdyson break; 37734611Sdyson 37834611Sdyson if (j != tbp->b_npages) 37934611Sdyson break; 38034611Sdyson 38134611Sdyson if (tbp->b_bcount != size) 38234611Sdyson break; 38334611Sdyson } 38434611Sdyson 3855455Sdg tbp = getblk(vp, lbn + i, size, 0, 0); 38610541Sdyson 3875455Sdg if ((tbp->b_flags & B_CACHE) || 38810541Sdyson (tbp->b_flags & B_VMIO) == 0) { 38913490Sdyson bqrelse(tbp); 3905455Sdg break; 3915455Sdg } 39210541Sdyson 39334611Sdyson for (j = 0;j < tbp->b_npages; j++) 39434611Sdyson if (tbp->b_pages[j]->valid) 39510541Sdyson break; 39610541Sdyson 39710541Sdyson if (j != tbp->b_npages) { 39834611Sdyson bqrelse(tbp); 39910541Sdyson break; 40010541Sdyson } 40110541Sdyson 40221002Sdyson if ((fbp && (i == 1)) || (i == (run - 1))) 40321002Sdyson tbp->b_flags |= B_RAM; 40410541Sdyson tbp->b_flags |= B_READ | B_ASYNC; 40512767Sdyson if (tbp->b_blkno == tbp->b_lblkno) { 40610541Sdyson tbp->b_blkno = bn; 40710541Sdyson } else if (tbp->b_blkno != bn) { 40810541Sdyson brelse(tbp); 40910541Sdyson break; 41010541Sdyson } 4111541Srgrimes } 41212404Sdyson TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, 41312404Sdyson tbp, b_cluster.cluster_entry); 4145455Sdg for (j = 0; j < tbp->b_npages; j += 1) { 41510541Sdyson vm_page_t m; 41610541Sdyson m = tbp->b_pages[j]; 41738799Sdfr vm_page_io_start(m); 41838517Sdfr vm_object_pip_add(m->object, 1); 41910541Sdyson if ((bp->b_npages == 0) || 42012413Sdyson (bp->b_pages[bp->b_npages-1] != m)) { 42110541Sdyson bp->b_pages[bp->b_npages] = m; 42210541Sdyson bp->b_npages++; 42310541Sdyson } 42418737Sdyson if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) 42518737Sdyson tbp->b_pages[j] = bogus_page; 4261541Srgrimes } 42710541Sdyson bp->b_bcount += tbp->b_bcount; 42810541Sdyson bp->b_bufsize += tbp->b_bufsize; 4291541Srgrimes } 43018737Sdyson 43118737Sdyson for(j=0;j<bp->b_npages;j++) { 43218737Sdyson if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) == 43318737Sdyson VM_PAGE_BITS_ALL) 43418737Sdyson bp->b_pages[j] = bogus_page; 43518737Sdyson } 43620054Sdyson if (bp->b_bufsize > bp->b_kvasize) 43737559Sbde panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", 43837559Sbde bp->b_bufsize, bp->b_kvasize); 43920054Sdyson bp->b_kvasize = bp->b_bufsize; 44018737Sdyson 44110541Sdyson pmap_qenter(trunc_page((vm_offset_t) bp->b_data), 44210541Sdyson (vm_page_t *)bp->b_pages, bp->b_npages); 4435455Sdg return (bp); 4441541Srgrimes} 4451541Srgrimes 4461541Srgrimes/* 4471541Srgrimes * Cleanup after a clustered read or write. 4481541Srgrimes * This is complicated by the fact that any of the buffers might have 4491541Srgrimes * extra memory (if there were no empty buffer headers at allocbuf time) 4501541Srgrimes * that we will need to shift around. 4511541Srgrimes */ 4521541Srgrimesvoid 4531541Srgrimescluster_callback(bp) 4541541Srgrimes struct buf *bp; 4551541Srgrimes{ 45612404Sdyson struct buf *nbp, *tbp; 4571541Srgrimes int error = 0; 4581541Srgrimes 4591541Srgrimes /* 4601541Srgrimes * Must propogate errors to all the components. 4611541Srgrimes */ 4621541Srgrimes if (bp->b_flags & B_ERROR) 4631541Srgrimes error = bp->b_error; 4641541Srgrimes 46510541Sdyson pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); 4661541Srgrimes /* 4671541Srgrimes * Move memory from the large cluster buffer into the component 4681541Srgrimes * buffers and mark IO as done on these. 4691541Srgrimes */ 47021002Sdyson for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); 47112404Sdyson tbp; tbp = nbp) { 47221002Sdyson nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); 4731541Srgrimes if (error) { 4741541Srgrimes tbp->b_flags |= B_ERROR; 4751541Srgrimes tbp->b_error = error; 47625135Sdfr } else 47725135Sdfr tbp->b_dirtyoff = tbp->b_dirtyend = 0; 4781541Srgrimes biodone(tbp); 4791541Srgrimes } 48042957Sdillon relpbuf(bp, &cluster_pbuf_freecnt); 4811541Srgrimes} 4821541Srgrimes 4831541Srgrimes/* 4841541Srgrimes * Do clustered write for FFS. 4851541Srgrimes * 4861541Srgrimes * Three cases: 4871541Srgrimes * 1. Write is not sequential (write asynchronously) 4881541Srgrimes * Write is sequential: 4891541Srgrimes * 2. beginning of cluster - begin cluster 4901541Srgrimes * 3. middle of a cluster - add to cluster 4911541Srgrimes * 4. end of a cluster - asynchronously write cluster 4921541Srgrimes */ 4931541Srgrimesvoid 4941541Srgrimescluster_write(bp, filesize) 4955455Sdg struct buf *bp; 4961541Srgrimes u_quad_t filesize; 4971541Srgrimes{ 4985455Sdg struct vnode *vp; 4995455Sdg daddr_t lbn; 5005455Sdg int maxclen, cursize; 5015455Sdg int lblocksize; 50212404Sdyson int async; 5031541Srgrimes 5045455Sdg vp = bp->b_vp; 50532724Sdyson if (vp->v_maxio == 0) 50632724Sdyson vp->v_maxio = DFLTPHYS; 50732286Sdyson if (vp->v_type == VREG) { 50832286Sdyson async = vp->v_mount->mnt_flag & MNT_ASYNC; 50932286Sdyson lblocksize = vp->v_mount->mnt_stat.f_iosize; 51032286Sdyson } else { 51132286Sdyson async = 0; 51232286Sdyson lblocksize = bp->b_bufsize; 51332286Sdyson } 5145455Sdg lbn = bp->b_lblkno; 51542408Seivind KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")); 51634694Sdyson 5171541Srgrimes /* Initialize vnode to beginning of file. */ 5181541Srgrimes if (lbn == 0) 5191541Srgrimes vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; 5201541Srgrimes 5215455Sdg if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || 5225455Sdg (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { 52332724Sdyson maxclen = vp->v_maxio / lblocksize - 1; 5241541Srgrimes if (vp->v_clen != 0) { 5251541Srgrimes /* 5261541Srgrimes * Next block is not sequential. 5278876Srgrimes * 5281541Srgrimes * If we are not writing at end of file, the process 5295455Sdg * seeked to another point in the file since its last 5305455Sdg * write, or we have reached our maximum cluster size, 5315455Sdg * then push the previous cluster. Otherwise try 5325455Sdg * reallocating to make it sequential. 5331541Srgrimes */ 5341541Srgrimes cursize = vp->v_lastw - vp->v_cstart + 1; 53534611Sdyson if (((u_quad_t) bp->b_offset + lblocksize) != filesize || 53610541Sdyson lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { 53712404Sdyson if (!async) 53812404Sdyson cluster_wbuild(vp, lblocksize, 53912404Sdyson vp->v_cstart, cursize); 54010541Sdyson } else { 54110541Sdyson struct buf **bpp, **endbp; 54210541Sdyson struct cluster_save *buflist; 54310541Sdyson 54410541Sdyson buflist = cluster_collectbufs(vp, bp); 54510541Sdyson endbp = &buflist->bs_children 54610541Sdyson [buflist->bs_nchildren - 1]; 54710541Sdyson if (VOP_REALLOCBLKS(vp, buflist)) { 54810541Sdyson /* 54910541Sdyson * Failed, push the previous cluster. 55010541Sdyson */ 55110541Sdyson for (bpp = buflist->bs_children; 55210541Sdyson bpp < endbp; bpp++) 55310541Sdyson brelse(*bpp); 55410541Sdyson free(buflist, M_SEGMENT); 55512404Sdyson cluster_wbuild(vp, lblocksize, 55612404Sdyson vp->v_cstart, cursize); 55710541Sdyson } else { 55810541Sdyson /* 55910541Sdyson * Succeeded, keep building cluster. 56010541Sdyson */ 56110541Sdyson for (bpp = buflist->bs_children; 56210541Sdyson bpp <= endbp; bpp++) 56310541Sdyson bdwrite(*bpp); 56410541Sdyson free(buflist, M_SEGMENT); 56510541Sdyson vp->v_lastw = lbn; 56610541Sdyson vp->v_lasta = bp->b_blkno; 56710541Sdyson return; 56810541Sdyson } 56910541Sdyson } 5701541Srgrimes } 5711541Srgrimes /* 5725455Sdg * Consider beginning a cluster. If at end of file, make 5735455Sdg * cluster as large as possible, otherwise find size of 5745455Sdg * existing cluster. 5751541Srgrimes */ 57632286Sdyson if ((vp->v_type == VREG) && 57734611Sdyson ((u_quad_t) bp->b_offset + lblocksize) != filesize && 5787613Sdg (bp->b_blkno == bp->b_lblkno) && 57910551Sdyson (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || 58010541Sdyson bp->b_blkno == -1)) { 5811541Srgrimes bawrite(bp); 5821541Srgrimes vp->v_clen = 0; 5831541Srgrimes vp->v_lasta = bp->b_blkno; 5841541Srgrimes vp->v_cstart = lbn + 1; 5851541Srgrimes vp->v_lastw = lbn; 5861541Srgrimes return; 5871541Srgrimes } 5885455Sdg vp->v_clen = maxclen; 58912404Sdyson if (!async && maxclen == 0) { /* I/O not contiguous */ 5901541Srgrimes vp->v_cstart = lbn + 1; 59113490Sdyson bawrite(bp); 5925455Sdg } else { /* Wait for rest of cluster */ 5931541Srgrimes vp->v_cstart = lbn; 5945455Sdg bdwrite(bp); 5951541Srgrimes } 5961541Srgrimes } else if (lbn == vp->v_cstart + vp->v_clen) { 5971541Srgrimes /* 5981541Srgrimes * At end of cluster, write it out. 5991541Srgrimes */ 60012404Sdyson bdwrite(bp); 60113490Sdyson cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); 6021541Srgrimes vp->v_clen = 0; 6031541Srgrimes vp->v_cstart = lbn + 1; 6041541Srgrimes } else 6051541Srgrimes /* 6065455Sdg * In the middle of a cluster, so just delay the I/O for now. 6071541Srgrimes */ 6081541Srgrimes bdwrite(bp); 6091541Srgrimes vp->v_lastw = lbn; 6101541Srgrimes vp->v_lasta = bp->b_blkno; 6111541Srgrimes} 6121541Srgrimes 6131541Srgrimes 6141541Srgrimes/* 6151541Srgrimes * This is an awful lot like cluster_rbuild...wish they could be combined. 6161541Srgrimes * The last lbn argument is the current block on which I/O is being 6171541Srgrimes * performed. Check to see that it doesn't fall in the middle of 6181541Srgrimes * the current block (if last_bp == NULL). 6191541Srgrimes */ 62012767Sdysonint 62112404Sdysoncluster_wbuild(vp, size, start_lbn, len) 6221541Srgrimes struct vnode *vp; 6231541Srgrimes long size; 6241541Srgrimes daddr_t start_lbn; 6251541Srgrimes int len; 6261541Srgrimes{ 62712404Sdyson struct buf *bp, *tbp; 6285455Sdg int i, j, s; 62912767Sdyson int totalwritten = 0; 63012404Sdyson int dbsize = btodb(size); 63135595Sbde 63235595Sbde if (vp->v_maxio == 0) 63335595Sbde vp->v_maxio = DFLTPHYS; 63412767Sdyson while (len > 0) { 63512767Sdyson s = splbio(); 63632286Sdyson if (((tbp = gbincore(vp, start_lbn)) == NULL) || 63734630Sjulian ((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) { 63812767Sdyson ++start_lbn; 63912767Sdyson --len; 64012767Sdyson splx(s); 64112767Sdyson continue; 64212767Sdyson } 64312767Sdyson bremfree(tbp); 64412767Sdyson tbp->b_flags |= B_BUSY; 64512767Sdyson tbp->b_flags &= ~B_DONE; 64612767Sdyson splx(s); 6471541Srgrimes 6481541Srgrimes /* 6495455Sdg * Extra memory in the buffer, punt on this buffer. XXX we could 6505455Sdg * handle this in most cases, but we would have to push the extra 6515455Sdg * memory down to after our max possible cluster size and then 6525455Sdg * potentially pull it back up if the cluster was terminated 6535455Sdg * prematurely--too much hassle. 6541541Srgrimes */ 65514319Sdyson if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) || 65634630Sjulian (tbp->b_bcount != tbp->b_bufsize) || 65734630Sjulian (tbp->b_bcount != size) || 65834630Sjulian (len == 1) || 65942957Sdillon ((bp = trypbuf(&cluster_pbuf_freecnt)) == NULL)) { 66012767Sdyson totalwritten += tbp->b_bufsize; 66112767Sdyson bawrite(tbp); 66212767Sdyson ++start_lbn; 66312767Sdyson --len; 66412767Sdyson continue; 66512767Sdyson } 66612404Sdyson 66734630Sjulian /* 66834630Sjulian * We got a pbuf to make the cluster in. 66934630Sjulian * so initialise it. 67034630Sjulian */ 67112767Sdyson TAILQ_INIT(&bp->b_cluster.cluster_head); 67212767Sdyson bp->b_bcount = 0; 67312767Sdyson bp->b_bufsize = 0; 67412767Sdyson bp->b_npages = 0; 67517304Sdyson if (tbp->b_wcred != NOCRED) { 67617304Sdyson bp->b_wcred = tbp->b_wcred; 67717304Sdyson crhold(bp->b_wcred); 67817304Sdyson } 6791541Srgrimes 68012767Sdyson bp->b_blkno = tbp->b_blkno; 68112767Sdyson bp->b_lblkno = tbp->b_lblkno; 68234611Sdyson bp->b_offset = tbp->b_offset; 68337467Sbde bp->b_data = (char *)((vm_offset_t)bp->b_data | 68437467Sbde ((vm_offset_t)tbp->b_data & PAGE_MASK)); 68532286Sdyson bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER | 68634694Sdyson (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT)); 68712767Sdyson bp->b_iodone = cluster_callback; 68812767Sdyson pbgetvp(vp, bp); 68934630Sjulian /* 69034630Sjulian * From this location in the file, scan forward to see 69134630Sjulian * if there are buffers with adjacent data that need to 69234630Sjulian * be written as well. 69334630Sjulian */ 69412767Sdyson for (i = 0; i < len; ++i, ++start_lbn) { 69534630Sjulian if (i != 0) { /* If not the first buffer */ 69612767Sdyson s = splbio(); 69734630Sjulian /* 69834630Sjulian * If the adjacent data is not even in core it 69934630Sjulian * can't need to be written. 70034630Sjulian */ 70112767Sdyson if ((tbp = gbincore(vp, start_lbn)) == NULL) { 70212767Sdyson splx(s); 70312767Sdyson break; 70412767Sdyson } 7051541Srgrimes 70634630Sjulian /* 70734630Sjulian * If it IS in core, but has different 70834630Sjulian * characteristics, don't cluster with it. 70934630Sjulian */ 71034630Sjulian if ((tbp->b_flags & 71134694Sdyson (B_VMIO | B_CLUSTEROK | B_INVAL | B_BUSY | 71234694Sdyson B_DELWRI | B_NEEDCOMMIT)) 71334694Sdyson != (B_DELWRI | B_CLUSTEROK | 71434694Sdyson (bp->b_flags & (B_VMIO | B_NEEDCOMMIT)))) { 71512767Sdyson splx(s); 71612767Sdyson break; 71712767Sdyson } 71812767Sdyson 71917304Sdyson if (tbp->b_wcred != bp->b_wcred) { 72017304Sdyson splx(s); 72117304Sdyson break; 72217304Sdyson } 72317304Sdyson 72434630Sjulian /* 72534630Sjulian * Check that the combined cluster 72634630Sjulian * would make sense with regard to pages 72734630Sjulian * and would not be too large 72834630Sjulian */ 72912767Sdyson if ((tbp->b_bcount != size) || 73034630Sjulian ((bp->b_blkno + (dbsize * i)) != 73134694Sdyson tbp->b_blkno) || 73234630Sjulian ((tbp->b_npages + bp->b_npages) > 73334630Sjulian (vp->v_maxio / PAGE_SIZE))) { 73412767Sdyson splx(s); 73512767Sdyson break; 73612767Sdyson } 73734630Sjulian /* 73834630Sjulian * Ok, it's passed all the tests, 73934630Sjulian * so remove it from the free list 74034630Sjulian * and mark it busy. We will use it. 74134630Sjulian */ 74212767Sdyson bremfree(tbp); 74312767Sdyson tbp->b_flags |= B_BUSY; 74412767Sdyson tbp->b_flags &= ~B_DONE; 74512404Sdyson splx(s); 74634630Sjulian } /* end of code for non-first buffers only */ 74734266Sjulian /* check for latent dependencies to be handled */ 74834266Sjulian if ((LIST_FIRST(&tbp->b_dep)) != NULL && 74934266Sjulian bioops.io_start) 75034266Sjulian (*bioops.io_start)(tbp); 75134630Sjulian /* 75234630Sjulian * If the IO is via the VM then we do some 75334630Sjulian * special VM hackery. (yuck) 75434630Sjulian */ 75513490Sdyson if (tbp->b_flags & B_VMIO) { 75632937Sdyson vm_page_t m; 75732937Sdyson 75834630Sjulian if (i != 0) { /* if not first buffer */ 75932937Sdyson for (j = 0; j < tbp->b_npages; j += 1) { 76032937Sdyson m = tbp->b_pages[j]; 76132937Sdyson if (m->flags & PG_BUSY) 76232937Sdyson goto finishcluster; 76332937Sdyson } 76432937Sdyson } 76532937Sdyson 76613490Sdyson for (j = 0; j < tbp->b_npages; j += 1) { 76713490Sdyson m = tbp->b_pages[j]; 76838799Sdfr vm_page_io_start(m); 76938517Sdfr vm_object_pip_add(m->object, 1); 77013490Sdyson if ((bp->b_npages == 0) || 77134630Sjulian (bp->b_pages[bp->b_npages - 1] != m)) { 77213490Sdyson bp->b_pages[bp->b_npages] = m; 77313490Sdyson bp->b_npages++; 77413490Sdyson } 77512767Sdyson } 77612767Sdyson } 77712767Sdyson bp->b_bcount += size; 77812767Sdyson bp->b_bufsize += size; 7791541Srgrimes 78038299Sdfr s = splbio(); 78144679Sjulian bundirty(tbp); 78244679Sjulian tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR); 78312767Sdyson tbp->b_flags |= B_ASYNC; 78412767Sdyson reassignbuf(tbp, tbp->b_vp); /* put on clean list */ 78512767Sdyson ++tbp->b_vp->v_numoutput; 78638299Sdfr splx(s); 78712767Sdyson TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, 78812767Sdyson tbp, b_cluster.cluster_entry); 7891541Srgrimes } 79032937Sdyson finishcluster: 79112767Sdyson pmap_qenter(trunc_page((vm_offset_t) bp->b_data), 79212767Sdyson (vm_page_t *) bp->b_pages, bp->b_npages); 79320054Sdyson if (bp->b_bufsize > bp->b_kvasize) 79437559Sbde panic( 79537559Sbde "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", 79637559Sbde bp->b_bufsize, bp->b_kvasize); 79720054Sdyson bp->b_kvasize = bp->b_bufsize; 79812767Sdyson totalwritten += bp->b_bufsize; 79917304Sdyson bp->b_dirtyoff = 0; 80017304Sdyson bp->b_dirtyend = bp->b_bufsize; 80112767Sdyson bawrite(bp); 8021541Srgrimes 80312767Sdyson len -= i; 8041541Srgrimes } 80512767Sdyson return totalwritten; 8061541Srgrimes} 8071541Srgrimes 8081541Srgrimes/* 8091541Srgrimes * Collect together all the buffers in a cluster. 8101541Srgrimes * Plus add one additional buffer. 8111541Srgrimes */ 81212973Sbdestatic struct cluster_save * 8131541Srgrimescluster_collectbufs(vp, last_bp) 8141541Srgrimes struct vnode *vp; 8151541Srgrimes struct buf *last_bp; 8161541Srgrimes{ 8171541Srgrimes struct cluster_save *buflist; 81841205Smckusick struct buf *bp; 8195455Sdg daddr_t lbn; 8201541Srgrimes int i, len; 8211541Srgrimes 8221541Srgrimes len = vp->v_lastw - vp->v_cstart + 1; 8231541Srgrimes buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), 8241541Srgrimes M_SEGMENT, M_WAITOK); 8251541Srgrimes buflist->bs_nchildren = 0; 8265455Sdg buflist->bs_children = (struct buf **) (buflist + 1); 82741205Smckusick for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { 82841205Smckusick (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp); 82941205Smckusick buflist->bs_children[i] = bp; 83041205Smckusick if (bp->b_blkno == bp->b_lblkno) 83141205Smckusick VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, 83241205Smckusick NULL, NULL); 83341205Smckusick } 83441529Smckusick buflist->bs_children[i] = bp = last_bp; 83541529Smckusick if (bp->b_blkno == bp->b_lblkno) 83641529Smckusick VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, 83741529Smckusick NULL, NULL); 8381541Srgrimes buflist->bs_nchildren = i + 1; 8391541Srgrimes return (buflist); 8401541Srgrimes} 841