vfs_cluster.c revision 38799
11541Srgrimes/*- 21541Srgrimes * Copyright (c) 1993 31541Srgrimes * The Regents of the University of California. All rights reserved. 45455Sdg * Modifications/enhancements: 55455Sdg * Copyright (c) 1995 John S. Dyson. All rights reserved. 61541Srgrimes * 71541Srgrimes * Redistribution and use in source and binary forms, with or without 81541Srgrimes * modification, are permitted provided that the following conditions 91541Srgrimes * are met: 101541Srgrimes * 1. Redistributions of source code must retain the above copyright 111541Srgrimes * notice, this list of conditions and the following disclaimer. 121541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 131541Srgrimes * notice, this list of conditions and the following disclaimer in the 141541Srgrimes * documentation and/or other materials provided with the distribution. 151541Srgrimes * 3. All advertising materials mentioning features or use of this software 161541Srgrimes * must display the following acknowledgement: 171541Srgrimes * This product includes software developed by the University of 181541Srgrimes * California, Berkeley and its contributors. 191541Srgrimes * 4. Neither the name of the University nor the names of its contributors 201541Srgrimes * may be used to endorse or promote products derived from this software 211541Srgrimes * without specific prior written permission. 221541Srgrimes * 231541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 241541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 251541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 261541Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 271541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 281541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 291541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 301541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 311541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 321541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 331541Srgrimes * SUCH DAMAGE. 341541Srgrimes * 351541Srgrimes * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 3638799Sdfr * $Id: vfs_cluster.c,v 1.69 1998/08/24 08:39:39 dfr Exp $ 371541Srgrimes */ 381541Srgrimes 3932929Seivind#include "opt_debug_cluster.h" 4032929Seivind 411541Srgrimes#include <sys/param.h> 421549Srgrimes#include <sys/systm.h> 431541Srgrimes#include <sys/proc.h> 441541Srgrimes#include <sys/buf.h> 451541Srgrimes#include <sys/vnode.h> 461541Srgrimes#include <sys/mount.h> 471541Srgrimes#include <sys/resourcevar.h> 486621Sdg#include <vm/vm.h> 4912662Sdg#include <vm/vm_prot.h> 5010541Sdyson#include <vm/vm_object.h> 5110541Sdyson#include <vm/vm_page.h> 521541Srgrimes 5321002Sdyson#if defined(CLUSTERDEBUG) 5421002Sdyson#include <sys/sysctl.h> 5521002Sdyson#include <sys/kernel.h> 5621002Sdysonstatic int rcluster= 0; 5724484SbdeSYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, ""); 5821002Sdyson#endif 5921002Sdyson 6012973Sbde#ifdef notyet_block_reallocation_enabled 6112973Sbdestatic struct cluster_save * 6212973Sbde cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp)); 6312973Sbde#endif 6412973Sbdestatic struct buf * 6512973Sbde cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn, 6621002Sdyson daddr_t blkno, long size, int run, struct buf *fbp)); 671541Srgrimes 6812973Sbdeextern vm_page_t bogus_page; 695455Sdg 701541Srgrimes/* 7121002Sdyson * Maximum number of blocks for read-ahead. 721541Srgrimes */ 7321002Sdyson#define MAXRA 32 745455Sdg 751541Srgrimes/* 7621002Sdyson * This replaces bread. 7710541Sdyson */ 781549Srgrimesint 7921002Sdysoncluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) 801541Srgrimes struct vnode *vp; 811541Srgrimes u_quad_t filesize; 821541Srgrimes daddr_t lblkno; 831541Srgrimes long size; 841541Srgrimes struct ucred *cred; 8521002Sdyson long totread; 8621002Sdyson int seqcount; 871541Srgrimes struct buf **bpp; 881541Srgrimes{ 8921002Sdyson struct buf *bp, *rbp, *reqbp; 9031016Sphk daddr_t blkno, origblkno; 9121002Sdyson int error, num_ra; 9210541Sdyson int i; 9321002Sdyson int maxra, racluster; 9421002Sdyson long origtotread; 951541Srgrimes 961541Srgrimes error = 0; 9732724Sdyson if (vp->v_maxio == 0) 9832724Sdyson vp->v_maxio = DFLTPHYS; 9921002Sdyson 1005455Sdg /* 10121002Sdyson * Try to limit the amount of read-ahead by a few 10221002Sdyson * ad-hoc parameters. This needs work!!! 10321002Sdyson */ 10432724Sdyson racluster = vp->v_maxio/size; 10521002Sdyson maxra = 2 * racluster + (totread / size); 10621002Sdyson if (maxra > MAXRA) 10721002Sdyson maxra = MAXRA; 10821002Sdyson if (maxra > nbuf/8) 10921002Sdyson maxra = nbuf/8; 11021002Sdyson 11121002Sdyson /* 1125455Sdg * get the requested block 1135455Sdg */ 11421002Sdyson *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0); 11521002Sdyson origblkno = lblkno; 11621002Sdyson origtotread = totread; 11712767Sdyson 1185455Sdg /* 1195455Sdg * if it is in the cache, then check to see if the reads have been 1205455Sdg * sequential. If they have, then try some read-ahead, otherwise 1215455Sdg * back-off on prospective read-aheads. 1225455Sdg */ 1231541Srgrimes if (bp->b_flags & B_CACHE) { 12421002Sdyson if (!seqcount) { 1255455Sdg return 0; 12621002Sdyson } else if ((bp->b_flags & B_RAM) == 0) { 12721002Sdyson return 0; 12821002Sdyson } else { 12921002Sdyson int s; 13021002Sdyson struct buf *tbp; 13121002Sdyson bp->b_flags &= ~B_RAM; 13221002Sdyson /* 13321002Sdyson * We do the spl here so that there is no window 13421002Sdyson * between the incore and the b_usecount increment 13521002Sdyson * below. We opt to keep the spl out of the loop 13621002Sdyson * for efficiency. 13721002Sdyson */ 13821002Sdyson s = splbio(); 13921002Sdyson for(i=1;i<maxra;i++) { 14021002Sdyson 14121002Sdyson if (!(tbp = incore(vp, lblkno+i))) { 14221002Sdyson break; 14321002Sdyson } 14421002Sdyson 14521002Sdyson /* 14621002Sdyson * Set another read-ahead mark so we know to check 14721002Sdyson * again. 14821002Sdyson */ 14921002Sdyson if (((i % racluster) == (racluster - 1)) || 15021002Sdyson (i == (maxra - 1))) 15121002Sdyson tbp->b_flags |= B_RAM; 15221002Sdyson 15334694Sdyson if ((tbp->b_usecount < 1) && 15434206Sdyson ((tbp->b_flags & B_BUSY) == 0) && 15534206Sdyson (tbp->b_qindex == QUEUE_LRU)) { 15634206Sdyson TAILQ_REMOVE(&bufqueues[QUEUE_LRU], tbp, b_freelist); 15734206Sdyson TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], tbp, b_freelist); 15821002Sdyson } 15921002Sdyson } 16021002Sdyson splx(s); 16121002Sdyson if (i >= maxra) { 1625839Sdg return 0; 16310541Sdyson } 16421002Sdyson lblkno += i; 16521002Sdyson } 16621002Sdyson reqbp = bp = NULL; 16721002Sdyson } else { 16834611Sdyson off_t firstread; 16934611Sdyson firstread = bp->b_offset; 17034694Sdyson#ifdef DIAGNOSTIC 17134694Sdyson if (bp->b_offset == NOOFFSET) 17234694Sdyson panic("cluster_read: no buffer offset"); 17334694Sdyson#endif 17421002Sdyson if (firstread + totread > filesize) 17521002Sdyson totread = filesize - firstread; 17621002Sdyson if (totread > size) { 17721002Sdyson int nblks = 0; 17821002Sdyson int ncontigafter; 17921002Sdyson while (totread > 0) { 18021002Sdyson nblks++; 18121002Sdyson totread -= size; 18221002Sdyson } 18321002Sdyson if (nblks == 1) 18421002Sdyson goto single_block_read; 18521002Sdyson if (nblks > racluster) 18621002Sdyson nblks = racluster; 18721002Sdyson 18821002Sdyson error = VOP_BMAP(vp, lblkno, NULL, 18921002Sdyson &blkno, &ncontigafter, NULL); 19021002Sdyson if (error) 19121002Sdyson goto single_block_read; 19221002Sdyson if (blkno == -1) 19321002Sdyson goto single_block_read; 19421002Sdyson if (ncontigafter == 0) 19521002Sdyson goto single_block_read; 19621002Sdyson if (ncontigafter + 1 < nblks) 19721002Sdyson nblks = ncontigafter + 1; 19821002Sdyson 19921002Sdyson bp = cluster_rbuild(vp, filesize, lblkno, 20021002Sdyson blkno, size, nblks, bp); 20134694Sdyson lblkno += (bp->b_bufsize / size); 20210541Sdyson } else { 20321002Sdysonsingle_block_read: 20421002Sdyson /* 20521002Sdyson * if it isn't in the cache, then get a chunk from 20621002Sdyson * disk if sequential, otherwise just get the block. 20721002Sdyson */ 20821002Sdyson bp->b_flags |= B_READ | B_RAM; 20910541Sdyson lblkno += 1; 2108876Srgrimes } 2111541Srgrimes } 2125455Sdg 2135455Sdg /* 2145455Sdg * if we have been doing sequential I/O, then do some read-ahead 2155455Sdg */ 21621002Sdyson rbp = NULL; 21721002Sdyson if (seqcount && (lblkno < (origblkno + seqcount))) { 2181541Srgrimes /* 21921002Sdyson * we now build the read-ahead buffer if it is desirable. 2201541Srgrimes */ 22121002Sdyson if (((u_quad_t)(lblkno + 1) * size) <= filesize && 22221002Sdyson !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) && 22321002Sdyson blkno != -1) { 22421002Sdyson int nblksread; 22521002Sdyson int ntoread = num_ra + 1; 22621002Sdyson nblksread = (origtotread + size - 1) / size; 22721002Sdyson if (seqcount < nblksread) 22821002Sdyson seqcount = nblksread; 22921002Sdyson if (seqcount < ntoread) 23021002Sdyson ntoread = seqcount; 23121002Sdyson if (num_ra) { 23221002Sdyson rbp = cluster_rbuild(vp, filesize, lblkno, 23321002Sdyson blkno, size, ntoread, NULL); 23421002Sdyson } else { 23521002Sdyson rbp = getblk(vp, lblkno, size, 0, 0); 23621002Sdyson rbp->b_flags |= B_READ | B_ASYNC | B_RAM; 23721002Sdyson rbp->b_blkno = blkno; 2385455Sdg } 2391541Srgrimes } 2405455Sdg } 2411541Srgrimes 2425455Sdg /* 24310541Sdyson * handle the synchronous read 2445455Sdg */ 2455455Sdg if (bp) { 24621002Sdyson#if defined(CLUSTERDEBUG) 24736275Sdyson if (rcluster) 24837951Sbde printf("S(%ld,%ld,%d) ", 24937951Sbde (long)bp->b_lblkno, bp->b_bcount, seqcount); 25021002Sdyson#endif 25136275Sdyson if ((bp->b_flags & B_CLUSTER) == 0) 25236275Sdyson vfs_busy_pages(bp, 0); 25337384Sjulian error = VOP_STRATEGY(vp, bp); 25436275Sdyson curproc->p_stats->p_ru.ru_inblock++; 2555455Sdg } 25634611Sdyson 2575455Sdg /* 2585455Sdg * and if we have read-aheads, do them too 2595455Sdg */ 2605455Sdg if (rbp) { 26113490Sdyson if (error) { 2621541Srgrimes rbp->b_flags &= ~(B_ASYNC | B_READ); 2631541Srgrimes brelse(rbp); 26413490Sdyson } else if (rbp->b_flags & B_CACHE) { 26513490Sdyson rbp->b_flags &= ~(B_ASYNC | B_READ); 26613490Sdyson bqrelse(rbp); 2675455Sdg } else { 26821002Sdyson#if defined(CLUSTERDEBUG) 26921002Sdyson if (rcluster) { 27021002Sdyson if (bp) 27137951Sbde printf("A+(%ld,%ld,%ld,%d) ", 27237951Sbde (long)rbp->b_lblkno, rbp->b_bcount, 27337951Sbde (long)(rbp->b_lblkno - origblkno), 27437951Sbde seqcount); 27521002Sdyson else 27637951Sbde printf("A(%ld,%ld,%ld,%d) ", 27737951Sbde (long)rbp->b_lblkno, rbp->b_bcount, 27837951Sbde (long)(rbp->b_lblkno - origblkno), 27937951Sbde seqcount); 28021002Sdyson } 28121002Sdyson#endif 28221002Sdyson 28310541Sdyson if ((rbp->b_flags & B_CLUSTER) == 0) 28410541Sdyson vfs_busy_pages(rbp, 0); 28537384Sjulian (void) VOP_STRATEGY(vp, rbp); 2865455Sdg curproc->p_stats->p_ru.ru_inblock++; 2875455Sdg } 2885455Sdg } 28921002Sdyson if (reqbp) 29021002Sdyson return (biowait(reqbp)); 29121002Sdyson else 29221002Sdyson return (error); 2931541Srgrimes} 2941541Srgrimes 2951541Srgrimes/* 2961541Srgrimes * If blocks are contiguous on disk, use this to provide clustered 2971541Srgrimes * read ahead. We will read as many blocks as possible sequentially 2981541Srgrimes * and then parcel them up into logical blocks in the buffer hash table. 2991541Srgrimes */ 30010541Sdysonstatic struct buf * 30121002Sdysoncluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) 3021541Srgrimes struct vnode *vp; 3031541Srgrimes u_quad_t filesize; 3041541Srgrimes daddr_t lbn; 3051541Srgrimes daddr_t blkno; 3061541Srgrimes long size; 3071541Srgrimes int run; 30821002Sdyson struct buf *fbp; 3091541Srgrimes{ 31010541Sdyson struct buf *bp, *tbp; 3111541Srgrimes daddr_t bn; 31238135Sdfr int i, inc, j, s; 3131541Srgrimes 3141541Srgrimes#ifdef DIAGNOSTIC 3151541Srgrimes if (size != vp->v_mount->mnt_stat.f_iosize) 31637951Sbde panic("cluster_rbuild: size %ld != filesize %ld\n", 3175455Sdg size, vp->v_mount->mnt_stat.f_iosize); 3181541Srgrimes#endif 31912767Sdyson /* 32012767Sdyson * avoid a division 32112767Sdyson */ 32212767Sdyson while ((u_quad_t) size * (lbn + run) > filesize) { 3231541Srgrimes --run; 32412767Sdyson } 32510541Sdyson 32621002Sdyson if (fbp) { 32721002Sdyson tbp = fbp; 32821002Sdyson tbp->b_flags |= B_READ; 32921002Sdyson } else { 33021002Sdyson tbp = getblk(vp, lbn, size, 0, 0); 33121002Sdyson if (tbp->b_flags & B_CACHE) 33221002Sdyson return tbp; 33321002Sdyson tbp->b_flags |= B_ASYNC | B_READ | B_RAM; 33421002Sdyson } 33510541Sdyson 33610541Sdyson tbp->b_blkno = blkno; 33716086Sdyson if( (tbp->b_flags & B_MALLOC) || 33816086Sdyson ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) 33910541Sdyson return tbp; 34010541Sdyson 34110541Sdyson bp = trypbuf(); 34210541Sdyson if (bp == 0) 34310541Sdyson return tbp; 34410541Sdyson 34537467Sbde bp->b_data = (char *)((vm_offset_t)bp->b_data | 34637467Sbde ((vm_offset_t)tbp->b_data & PAGE_MASK)); 34710541Sdyson bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO; 3485455Sdg bp->b_iodone = cluster_callback; 3495455Sdg bp->b_blkno = blkno; 3505455Sdg bp->b_lblkno = lbn; 35134611Sdyson bp->b_offset = tbp->b_offset; 35234694Sdyson#ifdef DIAGNOSTIC 35334694Sdyson if (bp->b_offset == NOOFFSET) 35434694Sdyson panic("cluster_rbuild: no buffer offset"); 35534694Sdyson#endif 3565455Sdg pbgetvp(vp, bp); 3571541Srgrimes 35812404Sdyson TAILQ_INIT(&bp->b_cluster.cluster_head); 3591541Srgrimes 3605455Sdg bp->b_bcount = 0; 3615455Sdg bp->b_bufsize = 0; 3625455Sdg bp->b_npages = 0; 3635455Sdg 36432724Sdyson if (vp->v_maxio == 0) 36532724Sdyson vp->v_maxio = DFLTPHYS; 3661541Srgrimes inc = btodb(size); 36710541Sdyson for (bn = blkno, i = 0; i < run; ++i, bn += inc) { 3685455Sdg if (i != 0) { 36912767Sdyson if ((bp->b_npages * PAGE_SIZE) + 37032724Sdyson round_page(size) > vp->v_maxio) 37110541Sdyson break; 37210978Sdyson 37334611Sdyson if (tbp = incore(vp, lbn + i)) { 37434611Sdyson if (tbp->b_flags & B_BUSY) 37534611Sdyson break; 37612767Sdyson 37734611Sdyson for (j = 0; j < tbp->b_npages; j++) 37834611Sdyson if (tbp->b_pages[j]->valid) 37934611Sdyson break; 38034611Sdyson 38134611Sdyson if (j != tbp->b_npages) 38234611Sdyson break; 38334611Sdyson 38434611Sdyson if (tbp->b_bcount != size) 38534611Sdyson break; 38634611Sdyson } 38734611Sdyson 3885455Sdg tbp = getblk(vp, lbn + i, size, 0, 0); 38910541Sdyson 3905455Sdg if ((tbp->b_flags & B_CACHE) || 39110541Sdyson (tbp->b_flags & B_VMIO) == 0) { 39213490Sdyson bqrelse(tbp); 3935455Sdg break; 3945455Sdg } 39510541Sdyson 39634611Sdyson for (j = 0;j < tbp->b_npages; j++) 39734611Sdyson if (tbp->b_pages[j]->valid) 39810541Sdyson break; 39910541Sdyson 40010541Sdyson if (j != tbp->b_npages) { 40134611Sdyson bqrelse(tbp); 40210541Sdyson break; 40310541Sdyson } 40410541Sdyson 40521002Sdyson if ((fbp && (i == 1)) || (i == (run - 1))) 40621002Sdyson tbp->b_flags |= B_RAM; 40710541Sdyson tbp->b_flags |= B_READ | B_ASYNC; 40812767Sdyson if (tbp->b_blkno == tbp->b_lblkno) { 40910541Sdyson tbp->b_blkno = bn; 41010541Sdyson } else if (tbp->b_blkno != bn) { 41110541Sdyson brelse(tbp); 41210541Sdyson break; 41310541Sdyson } 4141541Srgrimes } 41512404Sdyson TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, 41612404Sdyson tbp, b_cluster.cluster_entry); 4175455Sdg for (j = 0; j < tbp->b_npages; j += 1) { 41810541Sdyson vm_page_t m; 41910541Sdyson m = tbp->b_pages[j]; 42038799Sdfr vm_page_io_start(m); 42138517Sdfr vm_object_pip_add(m->object, 1); 42210541Sdyson if ((bp->b_npages == 0) || 42312413Sdyson (bp->b_pages[bp->b_npages-1] != m)) { 42410541Sdyson bp->b_pages[bp->b_npages] = m; 42510541Sdyson bp->b_npages++; 42610541Sdyson } 42718737Sdyson if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) 42818737Sdyson tbp->b_pages[j] = bogus_page; 4291541Srgrimes } 43010541Sdyson bp->b_bcount += tbp->b_bcount; 43110541Sdyson bp->b_bufsize += tbp->b_bufsize; 4321541Srgrimes } 43318737Sdyson 43418737Sdyson for(j=0;j<bp->b_npages;j++) { 43518737Sdyson if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) == 43618737Sdyson VM_PAGE_BITS_ALL) 43718737Sdyson bp->b_pages[j] = bogus_page; 43818737Sdyson } 43920054Sdyson if (bp->b_bufsize > bp->b_kvasize) 44037559Sbde panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", 44137559Sbde bp->b_bufsize, bp->b_kvasize); 44220054Sdyson bp->b_kvasize = bp->b_bufsize; 44318737Sdyson 44410541Sdyson pmap_qenter(trunc_page((vm_offset_t) bp->b_data), 44510541Sdyson (vm_page_t *)bp->b_pages, bp->b_npages); 4465455Sdg return (bp); 4471541Srgrimes} 4481541Srgrimes 4491541Srgrimes/* 4501541Srgrimes * Cleanup after a clustered read or write. 4511541Srgrimes * This is complicated by the fact that any of the buffers might have 4521541Srgrimes * extra memory (if there were no empty buffer headers at allocbuf time) 4531541Srgrimes * that we will need to shift around. 4541541Srgrimes */ 4551541Srgrimesvoid 4561541Srgrimescluster_callback(bp) 4571541Srgrimes struct buf *bp; 4581541Srgrimes{ 45912404Sdyson struct buf *nbp, *tbp; 4601541Srgrimes int error = 0; 4611541Srgrimes 4621541Srgrimes /* 4631541Srgrimes * Must propogate errors to all the components. 4641541Srgrimes */ 4651541Srgrimes if (bp->b_flags & B_ERROR) 4661541Srgrimes error = bp->b_error; 4671541Srgrimes 46810541Sdyson pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); 4691541Srgrimes /* 4701541Srgrimes * Move memory from the large cluster buffer into the component 4711541Srgrimes * buffers and mark IO as done on these. 4721541Srgrimes */ 47321002Sdyson for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); 47412404Sdyson tbp; tbp = nbp) { 47521002Sdyson nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); 4761541Srgrimes if (error) { 4771541Srgrimes tbp->b_flags |= B_ERROR; 4781541Srgrimes tbp->b_error = error; 47925135Sdfr } else 48025135Sdfr tbp->b_dirtyoff = tbp->b_dirtyend = 0; 4811541Srgrimes biodone(tbp); 4821541Srgrimes } 4835455Sdg relpbuf(bp); 4841541Srgrimes} 4851541Srgrimes 4861541Srgrimes/* 4871541Srgrimes * Do clustered write for FFS. 4881541Srgrimes * 4891541Srgrimes * Three cases: 4901541Srgrimes * 1. Write is not sequential (write asynchronously) 4911541Srgrimes * Write is sequential: 4921541Srgrimes * 2. beginning of cluster - begin cluster 4931541Srgrimes * 3. middle of a cluster - add to cluster 4941541Srgrimes * 4. end of a cluster - asynchronously write cluster 4951541Srgrimes */ 4961541Srgrimesvoid 4971541Srgrimescluster_write(bp, filesize) 4985455Sdg struct buf *bp; 4991541Srgrimes u_quad_t filesize; 5001541Srgrimes{ 5015455Sdg struct vnode *vp; 5025455Sdg daddr_t lbn; 5035455Sdg int maxclen, cursize; 5045455Sdg int lblocksize; 50512404Sdyson int async; 5061541Srgrimes 5075455Sdg vp = bp->b_vp; 50832724Sdyson if (vp->v_maxio == 0) 50932724Sdyson vp->v_maxio = DFLTPHYS; 51032286Sdyson if (vp->v_type == VREG) { 51132286Sdyson async = vp->v_mount->mnt_flag & MNT_ASYNC; 51232286Sdyson lblocksize = vp->v_mount->mnt_stat.f_iosize; 51332286Sdyson } else { 51432286Sdyson async = 0; 51532286Sdyson lblocksize = bp->b_bufsize; 51632286Sdyson } 5175455Sdg lbn = bp->b_lblkno; 5181541Srgrimes 51934694Sdyson#ifdef DIAGNOSTIC 52034694Sdyson if (bp->b_offset == NOOFFSET) 52134694Sdyson panic("cluster_write: no buffer offset"); 52234694Sdyson#endif 52334694Sdyson 5241541Srgrimes /* Initialize vnode to beginning of file. */ 5251541Srgrimes if (lbn == 0) 5261541Srgrimes vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; 5271541Srgrimes 5285455Sdg if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || 5295455Sdg (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { 53032724Sdyson maxclen = vp->v_maxio / lblocksize - 1; 5311541Srgrimes if (vp->v_clen != 0) { 5321541Srgrimes /* 5331541Srgrimes * Next block is not sequential. 5348876Srgrimes * 5351541Srgrimes * If we are not writing at end of file, the process 5365455Sdg * seeked to another point in the file since its last 5375455Sdg * write, or we have reached our maximum cluster size, 5385455Sdg * then push the previous cluster. Otherwise try 5395455Sdg * reallocating to make it sequential. 5401541Srgrimes */ 5411541Srgrimes cursize = vp->v_lastw - vp->v_cstart + 1; 54212973Sbde#ifndef notyet_block_reallocation_enabled 54334611Sdyson if (((u_quad_t) bp->b_offset + lblocksize) != filesize || 54412404Sdyson lbn != vp->v_lastw + 1 || 54512404Sdyson vp->v_clen <= cursize) { 54612404Sdyson if (!async) 54712404Sdyson cluster_wbuild(vp, lblocksize, 54812404Sdyson vp->v_cstart, cursize); 54912404Sdyson } 55012404Sdyson#else 55124484Sbde if ((lbn + 1) * lblocksize != filesize || 55210541Sdyson lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { 55312404Sdyson if (!async) 55412404Sdyson cluster_wbuild(vp, lblocksize, 55512404Sdyson vp->v_cstart, cursize); 55610541Sdyson } else { 55710541Sdyson struct buf **bpp, **endbp; 55810541Sdyson struct cluster_save *buflist; 55910541Sdyson 56010541Sdyson buflist = cluster_collectbufs(vp, bp); 56110541Sdyson endbp = &buflist->bs_children 56210541Sdyson [buflist->bs_nchildren - 1]; 56310541Sdyson if (VOP_REALLOCBLKS(vp, buflist)) { 56410541Sdyson /* 56510541Sdyson * Failed, push the previous cluster. 56610541Sdyson */ 56710541Sdyson for (bpp = buflist->bs_children; 56810541Sdyson bpp < endbp; bpp++) 56910541Sdyson brelse(*bpp); 57010541Sdyson free(buflist, M_SEGMENT); 57112404Sdyson cluster_wbuild(vp, lblocksize, 57212404Sdyson vp->v_cstart, cursize); 57310541Sdyson } else { 57410541Sdyson /* 57510541Sdyson * Succeeded, keep building cluster. 57610541Sdyson */ 57710541Sdyson for (bpp = buflist->bs_children; 57810541Sdyson bpp <= endbp; bpp++) 57910541Sdyson bdwrite(*bpp); 58010541Sdyson free(buflist, M_SEGMENT); 58110541Sdyson vp->v_lastw = lbn; 58210541Sdyson vp->v_lasta = bp->b_blkno; 58310541Sdyson return; 58410541Sdyson } 58510541Sdyson } 58612973Sbde#endif /* notyet_block_reallocation_enabled */ 5871541Srgrimes } 5881541Srgrimes /* 5895455Sdg * Consider beginning a cluster. If at end of file, make 5905455Sdg * cluster as large as possible, otherwise find size of 5915455Sdg * existing cluster. 5921541Srgrimes */ 59332286Sdyson if ((vp->v_type == VREG) && 59434611Sdyson ((u_quad_t) bp->b_offset + lblocksize) != filesize && 5957613Sdg (bp->b_blkno == bp->b_lblkno) && 59610551Sdyson (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || 59710541Sdyson bp->b_blkno == -1)) { 5981541Srgrimes bawrite(bp); 5991541Srgrimes vp->v_clen = 0; 6001541Srgrimes vp->v_lasta = bp->b_blkno; 6011541Srgrimes vp->v_cstart = lbn + 1; 6021541Srgrimes vp->v_lastw = lbn; 6031541Srgrimes return; 6041541Srgrimes } 6055455Sdg vp->v_clen = maxclen; 60612404Sdyson if (!async && maxclen == 0) { /* I/O not contiguous */ 6071541Srgrimes vp->v_cstart = lbn + 1; 60813490Sdyson bawrite(bp); 6095455Sdg } else { /* Wait for rest of cluster */ 6101541Srgrimes vp->v_cstart = lbn; 6115455Sdg bdwrite(bp); 6121541Srgrimes } 6131541Srgrimes } else if (lbn == vp->v_cstart + vp->v_clen) { 6141541Srgrimes /* 6151541Srgrimes * At end of cluster, write it out. 6161541Srgrimes */ 61712404Sdyson bdwrite(bp); 61813490Sdyson cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); 6191541Srgrimes vp->v_clen = 0; 6201541Srgrimes vp->v_cstart = lbn + 1; 6211541Srgrimes } else 6221541Srgrimes /* 6235455Sdg * In the middle of a cluster, so just delay the I/O for now. 6241541Srgrimes */ 6251541Srgrimes bdwrite(bp); 6261541Srgrimes vp->v_lastw = lbn; 6271541Srgrimes vp->v_lasta = bp->b_blkno; 6281541Srgrimes} 6291541Srgrimes 6301541Srgrimes 6311541Srgrimes/* 6321541Srgrimes * This is an awful lot like cluster_rbuild...wish they could be combined. 6331541Srgrimes * The last lbn argument is the current block on which I/O is being 6341541Srgrimes * performed. Check to see that it doesn't fall in the middle of 6351541Srgrimes * the current block (if last_bp == NULL). 6361541Srgrimes */ 63712767Sdysonint 63812404Sdysoncluster_wbuild(vp, size, start_lbn, len) 6391541Srgrimes struct vnode *vp; 6401541Srgrimes long size; 6411541Srgrimes daddr_t start_lbn; 6421541Srgrimes int len; 6431541Srgrimes{ 64412404Sdyson struct buf *bp, *tbp; 6455455Sdg int i, j, s; 64612767Sdyson int totalwritten = 0; 64712404Sdyson int dbsize = btodb(size); 64835595Sbde 64935595Sbde if (vp->v_maxio == 0) 65035595Sbde vp->v_maxio = DFLTPHYS; 65112767Sdyson while (len > 0) { 65212767Sdyson s = splbio(); 65332286Sdyson if (((tbp = gbincore(vp, start_lbn)) == NULL) || 65434630Sjulian ((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) { 65512767Sdyson ++start_lbn; 65612767Sdyson --len; 65712767Sdyson splx(s); 65812767Sdyson continue; 65912767Sdyson } 66012767Sdyson bremfree(tbp); 66112767Sdyson tbp->b_flags |= B_BUSY; 66212767Sdyson tbp->b_flags &= ~B_DONE; 66312767Sdyson splx(s); 6641541Srgrimes 6651541Srgrimes /* 6665455Sdg * Extra memory in the buffer, punt on this buffer. XXX we could 6675455Sdg * handle this in most cases, but we would have to push the extra 6685455Sdg * memory down to after our max possible cluster size and then 6695455Sdg * potentially pull it back up if the cluster was terminated 6705455Sdg * prematurely--too much hassle. 6711541Srgrimes */ 67214319Sdyson if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) || 67334630Sjulian (tbp->b_bcount != tbp->b_bufsize) || 67434630Sjulian (tbp->b_bcount != size) || 67534630Sjulian (len == 1) || 67634630Sjulian ((bp = trypbuf()) == NULL)) { 67712767Sdyson totalwritten += tbp->b_bufsize; 67812767Sdyson bawrite(tbp); 67912767Sdyson ++start_lbn; 68012767Sdyson --len; 68112767Sdyson continue; 68212767Sdyson } 68312404Sdyson 68434630Sjulian /* 68534630Sjulian * We got a pbuf to make the cluster in. 68634630Sjulian * so initialise it. 68734630Sjulian */ 68812767Sdyson TAILQ_INIT(&bp->b_cluster.cluster_head); 68912767Sdyson bp->b_bcount = 0; 69012767Sdyson bp->b_bufsize = 0; 69112767Sdyson bp->b_npages = 0; 69217304Sdyson if (tbp->b_wcred != NOCRED) { 69317304Sdyson bp->b_wcred = tbp->b_wcred; 69417304Sdyson crhold(bp->b_wcred); 69517304Sdyson } 6961541Srgrimes 69712767Sdyson bp->b_blkno = tbp->b_blkno; 69812767Sdyson bp->b_lblkno = tbp->b_lblkno; 69934611Sdyson bp->b_offset = tbp->b_offset; 70037467Sbde bp->b_data = (char *)((vm_offset_t)bp->b_data | 70137467Sbde ((vm_offset_t)tbp->b_data & PAGE_MASK)); 70232286Sdyson bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER | 70334694Sdyson (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT)); 70412767Sdyson bp->b_iodone = cluster_callback; 70512767Sdyson pbgetvp(vp, bp); 70634630Sjulian /* 70734630Sjulian * From this location in the file, scan forward to see 70834630Sjulian * if there are buffers with adjacent data that need to 70934630Sjulian * be written as well. 71034630Sjulian */ 71112767Sdyson for (i = 0; i < len; ++i, ++start_lbn) { 71234630Sjulian if (i != 0) { /* If not the first buffer */ 71312767Sdyson s = splbio(); 71434630Sjulian /* 71534630Sjulian * If the adjacent data is not even in core it 71634630Sjulian * can't need to be written. 71734630Sjulian */ 71812767Sdyson if ((tbp = gbincore(vp, start_lbn)) == NULL) { 71912767Sdyson splx(s); 72012767Sdyson break; 72112767Sdyson } 7221541Srgrimes 72334630Sjulian /* 72434630Sjulian * If it IS in core, but has different 72534630Sjulian * characteristics, don't cluster with it. 72634630Sjulian */ 72734630Sjulian if ((tbp->b_flags & 72834694Sdyson (B_VMIO | B_CLUSTEROK | B_INVAL | B_BUSY | 72934694Sdyson B_DELWRI | B_NEEDCOMMIT)) 73034694Sdyson != (B_DELWRI | B_CLUSTEROK | 73134694Sdyson (bp->b_flags & (B_VMIO | B_NEEDCOMMIT)))) { 73212767Sdyson splx(s); 73312767Sdyson break; 73412767Sdyson } 73512767Sdyson 73617304Sdyson if (tbp->b_wcred != bp->b_wcred) { 73717304Sdyson splx(s); 73817304Sdyson break; 73917304Sdyson } 74017304Sdyson 74134630Sjulian /* 74234630Sjulian * Check that the combined cluster 74334630Sjulian * would make sense with regard to pages 74434630Sjulian * and would not be too large 74534630Sjulian */ 74612767Sdyson if ((tbp->b_bcount != size) || 74734630Sjulian ((bp->b_blkno + (dbsize * i)) != 74834694Sdyson tbp->b_blkno) || 74934630Sjulian ((tbp->b_npages + bp->b_npages) > 75034630Sjulian (vp->v_maxio / PAGE_SIZE))) { 75112767Sdyson splx(s); 75212767Sdyson break; 75312767Sdyson } 75434630Sjulian /* 75534630Sjulian * Ok, it's passed all the tests, 75634630Sjulian * so remove it from the free list 75734630Sjulian * and mark it busy. We will use it. 75834630Sjulian */ 75912767Sdyson bremfree(tbp); 76012767Sdyson tbp->b_flags |= B_BUSY; 76112767Sdyson tbp->b_flags &= ~B_DONE; 76212404Sdyson splx(s); 76334630Sjulian } /* end of code for non-first buffers only */ 76434266Sjulian /* check for latent dependencies to be handled */ 76534266Sjulian if ((LIST_FIRST(&tbp->b_dep)) != NULL && 76634266Sjulian bioops.io_start) 76734266Sjulian (*bioops.io_start)(tbp); 76834630Sjulian /* 76934630Sjulian * If the IO is via the VM then we do some 77034630Sjulian * special VM hackery. (yuck) 77134630Sjulian */ 77213490Sdyson if (tbp->b_flags & B_VMIO) { 77332937Sdyson vm_page_t m; 77432937Sdyson 77534630Sjulian if (i != 0) { /* if not first buffer */ 77632937Sdyson for (j = 0; j < tbp->b_npages; j += 1) { 77732937Sdyson m = tbp->b_pages[j]; 77832937Sdyson if (m->flags & PG_BUSY) 77932937Sdyson goto finishcluster; 78032937Sdyson } 78132937Sdyson } 78232937Sdyson 78313490Sdyson for (j = 0; j < tbp->b_npages; j += 1) { 78413490Sdyson m = tbp->b_pages[j]; 78538799Sdfr vm_page_io_start(m); 78638517Sdfr vm_object_pip_add(m->object, 1); 78713490Sdyson if ((bp->b_npages == 0) || 78834630Sjulian (bp->b_pages[bp->b_npages - 1] != m)) { 78913490Sdyson bp->b_pages[bp->b_npages] = m; 79013490Sdyson bp->b_npages++; 79113490Sdyson } 79212767Sdyson } 79312767Sdyson } 79412767Sdyson bp->b_bcount += size; 79512767Sdyson bp->b_bufsize += size; 7961541Srgrimes 79738299Sdfr s = splbio(); 79826664Sdyson --numdirtybuffers; 79912767Sdyson tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); 80012767Sdyson tbp->b_flags |= B_ASYNC; 80112767Sdyson reassignbuf(tbp, tbp->b_vp); /* put on clean list */ 80212767Sdyson ++tbp->b_vp->v_numoutput; 80338299Sdfr splx(s); 80412767Sdyson TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, 80512767Sdyson tbp, b_cluster.cluster_entry); 8061541Srgrimes } 80732937Sdyson finishcluster: 80812767Sdyson pmap_qenter(trunc_page((vm_offset_t) bp->b_data), 80912767Sdyson (vm_page_t *) bp->b_pages, bp->b_npages); 81020054Sdyson if (bp->b_bufsize > bp->b_kvasize) 81137559Sbde panic( 81237559Sbde "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", 81337559Sbde bp->b_bufsize, bp->b_kvasize); 81420054Sdyson bp->b_kvasize = bp->b_bufsize; 81512767Sdyson totalwritten += bp->b_bufsize; 81617304Sdyson bp->b_dirtyoff = 0; 81717304Sdyson bp->b_dirtyend = bp->b_bufsize; 81812767Sdyson bawrite(bp); 8191541Srgrimes 82012767Sdyson len -= i; 8211541Srgrimes } 82212767Sdyson return totalwritten; 8231541Srgrimes} 8241541Srgrimes 82512973Sbde#ifdef notyet_block_reallocation_enabled 8261541Srgrimes/* 8271541Srgrimes * Collect together all the buffers in a cluster. 8281541Srgrimes * Plus add one additional buffer. 8291541Srgrimes */ 83012973Sbdestatic struct cluster_save * 8311541Srgrimescluster_collectbufs(vp, last_bp) 8321541Srgrimes struct vnode *vp; 8331541Srgrimes struct buf *last_bp; 8341541Srgrimes{ 8351541Srgrimes struct cluster_save *buflist; 8365455Sdg daddr_t lbn; 8371541Srgrimes int i, len; 8381541Srgrimes 8391541Srgrimes len = vp->v_lastw - vp->v_cstart + 1; 8401541Srgrimes buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), 8411541Srgrimes M_SEGMENT, M_WAITOK); 8421541Srgrimes buflist->bs_nchildren = 0; 8435455Sdg buflist->bs_children = (struct buf **) (buflist + 1); 8441541Srgrimes for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) 8455455Sdg (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, 8465455Sdg &buflist->bs_children[i]); 8471541Srgrimes buflist->bs_children[i] = last_bp; 8481541Srgrimes buflist->bs_nchildren = i + 1; 8491541Srgrimes return (buflist); 8501541Srgrimes} 85112973Sbde#endif /* notyet_block_reallocation_enabled */ 852