vfs_cluster.c revision 38299
11541Srgrimes/*- 21541Srgrimes * Copyright (c) 1993 31541Srgrimes * The Regents of the University of California. All rights reserved. 45455Sdg * Modifications/enhancements: 55455Sdg * Copyright (c) 1995 John S. Dyson. All rights reserved. 61541Srgrimes * 71541Srgrimes * Redistribution and use in source and binary forms, with or without 81541Srgrimes * modification, are permitted provided that the following conditions 91541Srgrimes * are met: 101541Srgrimes * 1. Redistributions of source code must retain the above copyright 111541Srgrimes * notice, this list of conditions and the following disclaimer. 121541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 131541Srgrimes * notice, this list of conditions and the following disclaimer in the 141541Srgrimes * documentation and/or other materials provided with the distribution. 151541Srgrimes * 3. All advertising materials mentioning features or use of this software 161541Srgrimes * must display the following acknowledgement: 171541Srgrimes * This product includes software developed by the University of 181541Srgrimes * California, Berkeley and its contributors. 191541Srgrimes * 4. Neither the name of the University nor the names of its contributors 201541Srgrimes * may be used to endorse or promote products derived from this software 211541Srgrimes * without specific prior written permission. 221541Srgrimes * 231541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 241541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 251541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 261541Srgrimes * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 271541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 281541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 291541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 301541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 311541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 321541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 331541Srgrimes * SUCH DAMAGE. 341541Srgrimes * 351541Srgrimes * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 3638299Sdfr * $Id: vfs_cluster.c,v 1.67 1998/08/06 08:33:18 dfr Exp $ 371541Srgrimes */ 381541Srgrimes 3932929Seivind#include "opt_debug_cluster.h" 4032929Seivind 411541Srgrimes#include <sys/param.h> 421549Srgrimes#include <sys/systm.h> 431541Srgrimes#include <sys/proc.h> 441541Srgrimes#include <sys/buf.h> 451541Srgrimes#include <sys/vnode.h> 461541Srgrimes#include <sys/mount.h> 471541Srgrimes#include <sys/resourcevar.h> 486621Sdg#include <vm/vm.h> 4912662Sdg#include <vm/vm_prot.h> 5010541Sdyson#include <vm/vm_object.h> 5110541Sdyson#include <vm/vm_page.h> 521541Srgrimes 5321002Sdyson#if defined(CLUSTERDEBUG) 5421002Sdyson#include <sys/sysctl.h> 5521002Sdyson#include <sys/kernel.h> 5621002Sdysonstatic int rcluster= 0; 5724484SbdeSYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, ""); 5821002Sdyson#endif 5921002Sdyson 6012973Sbde#ifdef notyet_block_reallocation_enabled 6112973Sbdestatic struct cluster_save * 6212973Sbde cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp)); 6312973Sbde#endif 6412973Sbdestatic struct buf * 6512973Sbde cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn, 6621002Sdyson daddr_t blkno, long size, int run, struct buf *fbp)); 671541Srgrimes 6812973Sbdeextern vm_page_t bogus_page; 695455Sdg 701541Srgrimes/* 
 * Maximum number of blocks for read-ahead.
 */
#define MAXRA 32

/*
 * Replacement for bread():  read the requested block and, when the
 * access pattern looks sequential, schedule clustered read-ahead.
 *
 *	vp	 vnode to read from
 *	filesize current EOF, used to clip clusters/read-ahead
 *	lblkno	 logical block number requested
 *	size	 logical block size
 *	cred	 credentials (currently unused here; kept for interface)
 *	totread	 total bytes the caller intends to read
 *	seqcount heuristic sequential-access count from the caller
 *	bpp	 out: the buffer for lblkno
 *
 * Returns 0 or an error from VOP_STRATEGY/biowait.
 */
int
cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	long totread;
	int seqcount;
	struct buf **bpp;
{
	struct buf *bp, *rbp, *reqbp;
	daddr_t blkno, origblkno;
	int error, num_ra;
	int i;
	int maxra, racluster;
	long origtotread;

	error = 0;
	if (vp->v_maxio == 0)
		vp->v_maxio = DFLTPHYS;

	/*
	 * Try to limit the amount of read-ahead by a few
	 * ad-hoc parameters.  This needs work!!!
	 */
	racluster = vp->v_maxio/size;
	maxra = 2 * racluster + (totread / size);
	if (maxra > MAXRA)
		maxra = MAXRA;
	if (maxra > nbuf/8)
		maxra = nbuf/8;

	/*
	 * get the requested block
	 */
	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0);
	origblkno = lblkno;
	origtotread = totread;

	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		if (!seqcount) {
			return 0;
		} else if ((bp->b_flags & B_RAM) == 0) {
			/* No read-ahead mark hit; nothing more to do. */
			return 0;
		} else {
			int s;
			struct buf *tbp;
			bp->b_flags &= ~B_RAM;
			/*
			 * We do the spl here so that there is no window
			 * between the incore and the b_usecount increment
			 * below.  We opt to keep the spl out of the loop
			 * for efficiency.
			 */
			s = splbio();
			/*
			 * Scan forward through already-cached read-ahead
			 * blocks; re-mark periodic B_RAM trigger points and
			 * freshen LRU position of unused buffers.
			 */
			for(i=1;i<maxra;i++) {

				if (!(tbp = incore(vp, lblkno+i))) {
					break;
				}

				/*
				 * Set another read-ahead mark so we know to check
				 * again.
				 */
				if (((i % racluster) == (racluster - 1)) ||
					(i == (maxra - 1)))
					tbp->b_flags |= B_RAM;

				if ((tbp->b_usecount < 1) &&
					((tbp->b_flags & B_BUSY) == 0) &&
					(tbp->b_qindex == QUEUE_LRU)) {
					/* Move to LRU tail so it survives longer. */
					TAILQ_REMOVE(&bufqueues[QUEUE_LRU], tbp, b_freelist);
					TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], tbp, b_freelist);
				}
			}
			splx(s);
			if (i >= maxra) {
				return 0;
			}
			/* First non-cached block: start read-ahead there. */
			lblkno += i;
		}
		/* Request already satisfied from cache; only read-ahead left. */
		reqbp = bp = NULL;
	} else {
		off_t firstread;
		firstread = bp->b_offset;
#ifdef DIAGNOSTIC
		if (bp->b_offset == NOOFFSET)
			panic("cluster_read: no buffer offset");
#endif
		if (firstread + totread > filesize)
			totread = filesize - firstread;
		if (totread > size) {
			int nblks = 0;
			int ncontigafter;
			/* Count whole blocks in totread (avoids a division). */
			while (totread > 0) {
				nblks++;
				totread -= size;
			}
			if (nblks == 1)
				goto single_block_read;
			if (nblks > racluster)
				nblks = racluster;

	    		error = VOP_BMAP(vp, lblkno, NULL,
				&blkno, &ncontigafter, NULL);
			if (error)
				goto single_block_read;
			if (blkno == -1)
				goto single_block_read;
			if (ncontigafter == 0)
				goto single_block_read;
			if (ncontigafter + 1 < nblks)
				nblks = ncontigafter + 1;

			bp = cluster_rbuild(vp, filesize, lblkno,
				blkno, size, nblks, bp);
			lblkno += (bp->b_bufsize / size);
		} else {
single_block_read:
			/*
			 * if it isn't in the cache, then get a chunk from
			 * disk if sequential, otherwise just get the block.
			 */
			bp->b_flags |= B_READ | B_RAM;
			lblkno += 1;
		}
	}

	/*
	 * if we have been doing sequential I/O, then do some read-ahead
	 */
	rbp = NULL;
	if (seqcount && (lblkno < (origblkno + seqcount))) {
		/*
		 * we now build the read-ahead buffer if it is desirable.
		 */
		if (((u_quad_t)(lblkno + 1) * size) <= filesize &&
		    !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) &&
		    blkno != -1) {
			int nblksread;
			int ntoread = num_ra + 1;
			nblksread = (origtotread + size - 1) / size;
			if (seqcount < nblksread)
				seqcount = nblksread;
			if (seqcount < ntoread)
				ntoread = seqcount;
			if (num_ra) {
				rbp = cluster_rbuild(vp, filesize, lblkno,
					blkno, size, ntoread, NULL);
			} else {
				/* Only one contiguous block: plain async read. */
				rbp = getblk(vp, lblkno, size, 0, 0);
				rbp->b_flags |= B_READ | B_ASYNC | B_RAM;
				rbp->b_blkno = blkno;
			}
		}
	}

	/*
	 * handle the synchronous read
	 */
	if (bp) {
#if defined(CLUSTERDEBUG)
		if (rcluster)
			printf("S(%ld,%ld,%d) ",
			    (long)bp->b_lblkno, bp->b_bcount, seqcount);
#endif
		if ((bp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(bp, 0);
		error = VOP_STRATEGY(vp, bp);
		curproc->p_stats->p_ru.ru_inblock++;
	}

	/*
	 * and if we have read-aheads, do them too
	 */
	if (rbp) {
		if (error) {
			/* Synchronous read failed; drop the read-ahead. */
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else if (rbp->b_flags & B_CACHE) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			bqrelse(rbp);
		} else {
#if defined(CLUSTERDEBUG)
			if (rcluster) {
				if (bp)
					printf("A+(%ld,%ld,%ld,%d) ",
					    (long)rbp->b_lblkno, rbp->b_bcount,
					    (long)(rbp->b_lblkno - origblkno),
					    seqcount);
				else
					printf("A(%ld,%ld,%ld,%d) ",
					    (long)rbp->b_lblkno, rbp->b_bcount,
					    (long)(rbp->b_lblkno - origblkno),
					    seqcount);
			}
#endif

			if ((rbp->b_flags & B_CLUSTER) == 0)
				vfs_busy_pages(rbp, 0);
			(void) VOP_STRATEGY(vp, rbp);
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	if (reqbp)
		return (biowait(reqbp));
	else
		return (error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 *
 * Returns either a pbuf-based cluster buffer covering up to 'run'
 * blocks starting at lbn/blkno, or (when clustering is not possible)
 * the single buffer for lbn.  'fbp', if non-NULL, is the caller's
 * already-allocated buffer for lbn and becomes the first component.
 */
static struct buf *
cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	struct buf *fbp;
{
	struct buf *bp, *tbp;
	daddr_t bn;
	int i, inc, j, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %ld != filesize %ld\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
	/*
	 * avoid a division
	 */
	while ((u_quad_t) size * (lbn + run) > filesize) {
		--run;
	}

	if (fbp) {
		tbp = fbp;
		tbp->b_flags |= B_READ;
	} else {
		tbp = getblk(vp, lbn, size, 0, 0);
		if (tbp->b_flags & B_CACHE)
			return tbp;
		tbp->b_flags |= B_ASYNC | B_READ | B_RAM;
	}

	tbp->b_blkno = blkno;
	/* malloc'ed or non-VMIO buffers can't be clustered; single block. */
	if( (tbp->b_flags & B_MALLOC) ||
		((tbp->b_flags & B_VMIO) == 0) || (run <= 1) )
		return tbp;

	bp = trypbuf();
	if (bp == 0)
		return tbp;

	/* Keep the first block's intra-page offset in the cluster mapping. */
	bp->b_data = (char *)((vm_offset_t)bp->b_data |
	    ((vm_offset_t)tbp->b_data & PAGE_MASK));
	bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
	bp->b_iodone = cluster_callback;
	bp->b_blkno = blkno;
	bp->b_lblkno = lbn;
	bp->b_offset = tbp->b_offset;
#ifdef DIAGNOSTIC
	if (bp->b_offset == NOOFFSET)
		panic("cluster_rbuild: no buffer offset");
#endif
	pbgetvp(vp, bp);

	TAILQ_INIT(&bp->b_cluster.cluster_head);

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	if (vp->v_maxio == 0)
		vp->v_maxio = DFLTPHYS;
	inc = btodb(size);
	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
		if (i != 0) {
			/* Stop before exceeding the vnode's max transfer size. */
			if ((bp->b_npages * PAGE_SIZE) +
				round_page(size) > vp->v_maxio)
				break;

			/*
			 * Peek at the in-core buffer first; bail if it is
			 * busy, already has valid pages, or has an odd size.
			 */
			if (tbp = incore(vp, lbn + i)) {
				if (tbp->b_flags & B_BUSY)
					break;

				for (j = 0; j < tbp->b_npages; j++)
					if (tbp->b_pages[j]->valid)
						break;

				if (j != tbp->b_npages)
					break;

				if (tbp->b_bcount != size)
					break;
			}

			tbp = getblk(vp, lbn + i, size, 0, 0);

			if ((tbp->b_flags & B_CACHE) ||
				(tbp->b_flags & B_VMIO) == 0) {
				bqrelse(tbp);
				break;
			}

			/* Re-check validity after getblk may have changed pages. */
			for (j = 0;j < tbp->b_npages; j++)
				if (tbp->b_pages[j]->valid)
					break;

			if (j != tbp->b_npages) {
				bqrelse(tbp);
				break;
			}

			if ((fbp && (i == 1)) || (i == (run - 1)))
				tbp->b_flags |= B_RAM;
			tbp->b_flags |= B_READ | B_ASYNC;
			if (tbp->b_blkno == tbp->b_lblkno) {
				tbp->b_blkno = bn;
			} else if (tbp->b_blkno != bn) {
				/* Not physically contiguous after all. */
				brelse(tbp);
				break;
			}
		}
		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
			tbp, b_cluster.cluster_entry);
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;
			m = tbp->b_pages[j];
			s = splvm();
			++m->busy;
			++m->object->paging_in_progress;
			splx(s);
			/* Adjacent buffers may share a page; add each once. */
			if ((bp->b_npages == 0) ||
				(bp->b_pages[bp->b_npages-1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
			/* Fully-valid pages need no I/O; substitute bogus_page. */
			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
				tbp->b_pages[j] = bogus_page;
		}
		bp->b_bcount += tbp->b_bcount;
		bp->b_bufsize += tbp->b_bufsize;
	}

	for(j=0;j<bp->b_npages;j++) {
		if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) ==
		    VM_PAGE_BITS_ALL)
			bp->b_pages[j] = bogus_page;
	}
	if (bp->b_bufsize > bp->b_kvasize)
		panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
		    bp->b_bufsize, bp->b_kvasize);
	bp->b_kvasize = bp->b_bufsize;

	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
		(vm_page_t *)bp->b_pages, bp->b_npages);
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 *
 * Runs as the b_iodone callback of the cluster pbuf: propagates any
 * error to every component buffer, completes them, and releases the pbuf.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct buf *nbp, *tbp;
	int error = 0;

	/*
	 * Must propogate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
		tbp; tbp = nbp) {
		/* Fetch next before biodone(); tbp may be reused afterwards. */
		nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		} else
			tbp->b_dirtyoff = tbp->b_dirtyend = 0;
		biodone(tbp);
	}
	relpbuf(bp);
}

/*
 * Do clustered write for FFS.
 *
 * Three cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 *
 * Cluster state is kept per-vnode in v_cstart/v_lastw/v_lasta/v_clen.
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;
	int lblocksize;
	int async;

	vp = bp->b_vp;
	if (vp->v_maxio == 0)
		vp->v_maxio = DFLTPHYS;
	if (vp->v_type == VREG) {
		async = vp->v_mount->mnt_flag & MNT_ASYNC;
		lblocksize = vp->v_mount->mnt_stat.f_iosize;
	} else {
		async = 0;
		lblocksize = bp->b_bufsize;
	}
	lbn = bp->b_lblkno;

#ifdef DIAGNOSTIC
	if (bp->b_offset == NOOFFSET)
		panic("cluster_write: no buffer offset");
#endif

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
		maxclen = vp->v_maxio / lblocksize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster. Otherwise try
			 * reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
#ifndef notyet_block_reallocation_enabled
			if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
				lbn != vp->v_lastw + 1 ||
				vp->v_clen <= cursize) {
				if (!async)
					cluster_wbuild(vp, lblocksize,
						vp->v_cstart, cursize);
			}
#else
			if ((lbn + 1) * lblocksize != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				if (!async)
					cluster_wbuild(vp, lblocksize,
						vp->v_cstart, cursize);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 */
					for (bpp = buflist->bs_children;
					    bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, lblocksize,
					    vp->v_cstart, cursize);
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					    bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
#endif /* notyet_block_reallocation_enabled */
		}
		/*
		 * Consider beginning a cluster. If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if ((vp->v_type == VREG) &&
			((u_quad_t) bp->b_offset + lblocksize) != filesize &&
		    (bp->b_blkno == bp->b_lblkno) &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
		     bp->b_blkno == -1)) {
			/* BMAP failed or block unallocated: just write it. */
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (!async && maxclen == 0) {	/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {	/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		bdwrite(bp);
		cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 *
 * Gather up to 'len' dirty, physically-contiguous blocks starting at
 * start_lbn into cluster pbufs and issue them asynchronously.
 * Returns the total number of bytes scheduled for write.
 */
int
cluster_wbuild(vp, size, start_lbn, len)
	struct vnode *vp;
	long size;
	daddr_t start_lbn;
	int len;
{
	struct buf *bp, *tbp;
	int i, j, s;
	int totalwritten = 0;
	int dbsize = btodb(size);

	if (vp->v_maxio == 0)
		vp->v_maxio = DFLTPHYS;
	while (len > 0) {
		s = splbio();
		/* Skip blocks that are absent, busy, invalid, or clean. */
		if (((tbp = gbincore(vp, start_lbn)) == NULL) ||
		  ((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) {
			++start_lbn;
			--len;
			splx(s);
			continue;
		}
		bremfree(tbp);
		tbp->b_flags |= B_BUSY;
		tbp->b_flags &= ~B_DONE;
		splx(s);

		/*
		 * Extra memory in the buffer, punt on this buffer. XXX we could
		 * handle this in most cases, but we would have to push the extra
		 * memory down to after our max possible cluster size and then
		 * potentially pull it back up if the cluster was terminated
		 * prematurely--too much hassle.
		 */
		if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
		  (tbp->b_bcount != tbp->b_bufsize) ||
		  (tbp->b_bcount != size) ||
		  (len == 1) ||
		  ((bp = trypbuf()) == NULL)) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			++start_lbn;
			--len;
			continue;
		}

		/*
		 * We got a pbuf to make the cluster in.
		 * so initialise it.
		 */
		TAILQ_INIT(&bp->b_cluster.cluster_head);
		bp->b_bcount = 0;
		bp->b_bufsize = 0;
		bp->b_npages = 0;
		if (tbp->b_wcred != NOCRED) {
		    bp->b_wcred = tbp->b_wcred;
		    crhold(bp->b_wcred);
		}

		bp->b_blkno = tbp->b_blkno;
		bp->b_lblkno = tbp->b_lblkno;
		bp->b_offset = tbp->b_offset;
		/* Preserve first buffer's intra-page offset in the mapping. */
		bp->b_data = (char *)((vm_offset_t)bp->b_data |
		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
		bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER |
			(tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
		bp->b_iodone = cluster_callback;
		pbgetvp(vp, bp);
		/*
		 * From this location in the file, scan forward to see
		 * if there are buffers with adjacent data that need to
		 * be written as well.
		 */
		for (i = 0; i < len; ++i, ++start_lbn) {
			if (i != 0) { /* If not the first buffer */
				s = splbio();
				/*
				 * If the adjacent data is not even in core it
				 * can't need to be written.
				 */
				if ((tbp = gbincore(vp, start_lbn)) == NULL) {
					splx(s);
					break;
				}

				/*
				 * If it IS in core, but has different
				 * characteristics, don't cluster with it.
				 */
				if ((tbp->b_flags &
				  (B_VMIO | B_CLUSTEROK | B_INVAL | B_BUSY |
				    B_DELWRI | B_NEEDCOMMIT))
				  != (B_DELWRI | B_CLUSTEROK |
				    (bp->b_flags & (B_VMIO | B_NEEDCOMMIT)))) {
					splx(s);
					break;
				}

				if (tbp->b_wcred != bp->b_wcred) {
					splx(s);
					break;
				}

				/*
				 * Check that the combined cluster
				 * would make sense with regard to pages
				 * and would not be too large
				 */
				if ((tbp->b_bcount != size) ||
				  ((bp->b_blkno + (dbsize * i)) !=
				    tbp->b_blkno) ||
				  ((tbp->b_npages + bp->b_npages) >
				    (vp->v_maxio / PAGE_SIZE))) {
					splx(s);
					break;
				}
				/*
				 * Ok, it's passed all the tests,
				 * so remove it from the free list
				 * and mark it busy. We will use it.
				 */
				bremfree(tbp);
				tbp->b_flags |= B_BUSY;
				tbp->b_flags &= ~B_DONE;
				splx(s);
			} /* end of code for non-first buffers only */
			/* check for latent dependencies to be handled */
			if ((LIST_FIRST(&tbp->b_dep)) != NULL &&
			    bioops.io_start)
				(*bioops.io_start)(tbp);
			/*
			 * If the IO is via the VM then we do some
			 * special VM hackery. (yuck)
			 */
			if (tbp->b_flags & B_VMIO) {
				vm_page_t m;

				if (i != 0) { /* if not first buffer */
					for (j = 0; j < tbp->b_npages; j += 1) {
						m = tbp->b_pages[j];
						if (m->flags & PG_BUSY)
							goto finishcluster;
					}
				}

				for (j = 0; j < tbp->b_npages; j += 1) {
					m = tbp->b_pages[j];
					s = splvm();
					++m->busy;
					++m->object->paging_in_progress;
					splx(s);
					/* Shared boundary pages added only once. */
					if ((bp->b_npages == 0) ||
					  (bp->b_pages[bp->b_npages - 1] != m)) {
						bp->b_pages[bp->b_npages] = m;
						bp->b_npages++;
					}
				}
			}
			bp->b_bcount += size;
			bp->b_bufsize += size;

			s = splbio();
			--numdirtybuffers;
			tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
			tbp->b_flags |= B_ASYNC;
			reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
			++tbp->b_vp->v_numoutput;
			splx(s);
			TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
				tbp, b_cluster.cluster_entry);
		}
	finishcluster:
		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
			(vm_page_t *) bp->b_pages, bp->b_npages);
		if (bp->b_bufsize > bp->b_kvasize)
			panic(
			    "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
			    bp->b_bufsize, bp->b_kvasize);
		bp->b_kvasize = bp->b_bufsize;
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
		bawrite(bp);

		len -= i;
	}
	return totalwritten;
}

#ifdef notyet_block_reallocation_enabled
/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 *
 * Reads blocks v_cstart..v_lastw into a malloc'ed cluster_save whose
 * bs_children array holds the buffers followed by last_bp; the caller
 * owns the returned structure (free with M_SEGMENT) and the buffers.
 * Currently compiled out (block reallocation not yet enabled).
 */
static struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *buflist;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	/* One allocation: header followed by len+1 child pointers. */
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED,
		    &buflist->bs_children[i]);
	buflist->bs_children[i] = last_bp;
	buflist->bs_nchildren = i + 1;
	return (buflist);
}
#endif /* notyet_block_reallocation_enabled */