vfs_cluster.c revision 31016
1270096Strasz/*- 2270096Strasz * Copyright (c) 1993 3270096Strasz * The Regents of the University of California. All rights reserved. 4270096Strasz * Modifications/enhancements: 5270096Strasz * Copyright (c) 1995 John S. Dyson. All rights reserved. 6270096Strasz * 7270096Strasz * Redistribution and use in source and binary forms, with or without 8270096Strasz * modification, are permitted provided that the following conditions 9270096Strasz * are met: 10270096Strasz * 1. Redistributions of source code must retain the above copyright 11270096Strasz * notice, this list of conditions and the following disclaimer. 12270096Strasz * 2. Redistributions in binary form must reproduce the above copyright 13270096Strasz * notice, this list of conditions and the following disclaimer in the 14270096Strasz * documentation and/or other materials provided with the distribution. 15270096Strasz * 3. All advertising materials mentioning features or use of this software 16270096Strasz * must display the following acknowledgement: 17270096Strasz * This product includes software developed by the University of 18270096Strasz * California, Berkeley and its contributors. 19270096Strasz * 4. Neither the name of the University nor the names of its contributors 20270096Strasz * may be used to endorse or promote products derived from this software 21270096Strasz * without specific prior written permission. 22270096Strasz * 23270096Strasz * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24270096Strasz * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25270096Strasz * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26270096Strasz * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27270096Strasz * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28270096Strasz * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29270096Strasz * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30270096Strasz * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31270096Strasz * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32270096Strasz * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33270096Strasz * SUCH DAMAGE. 34270096Strasz * 35270096Strasz * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 36270096Strasz * $Id: vfs_cluster.c,v 1.48 1997/08/02 14:31:43 bde Exp $ 37270096Strasz */ 38270096Strasz 39270096Strasz#include <sys/param.h> 40270096Strasz#include <sys/systm.h> 41270096Strasz#include <sys/proc.h> 42270096Strasz#include <sys/buf.h> 43270096Strasz#include <sys/vnode.h> 44296718Strasz#include <sys/mount.h> 45270096Strasz#include <sys/resourcevar.h> 46272403Strasz#include <vm/vm.h> 47297236Strasz#include <vm/vm_prot.h> 48270096Strasz#include <vm/vm_object.h> 49270096Strasz#include <vm/vm_page.h> 50270096Strasz 51270096Strasz#if defined(CLUSTERDEBUG) 52270281Strasz#include <sys/sysctl.h> 53270096Strasz#include <sys/kernel.h> 54270096Straszstatic int rcluster= 0; 55270096StraszSYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, ""); 56270096Strasz#endif 57270402Strasz 58270402Strasz#ifdef notyet_block_reallocation_enabled 59270096Straszstatic struct cluster_save * 60270096Strasz cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp)); 61270096Strasz#endif 62270096Straszstatic struct buf * 63270096Strasz cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn, 64270096Strasz daddr_t blkno, long size, int run, struct buf *fbp)); 65270096Strasz 66270096Straszextern vm_page_t bogus_page; 67270096Strasz 68270096Strasz/* 69270096Strasz 
 * Maximum number of blocks for read-ahead.
 */
#define MAXRA 32

/*
 * This replaces bread.
 *
 * Read the logical block lblkno of vp into a buffer, returning it through
 * *bpp, and opportunistically start clustered read-ahead based on the
 * sequential-access heuristic:
 *
 *	vp	- vnode to read from
 *	filesize - file size in bytes; read-ahead is clamped to it
 *	lblkno	- logical block number of the requested block
 *	size	- filesystem block size in bytes
 *	cred	- credentials (accepted but not used by this routine)
 *	totread	- total number of bytes the caller intends to read
 *	seqcount - caller's estimate of access sequentiality; 0 disables
 *		   read-ahead entirely
 *	bpp	- out: the buffer for the requested block
 *
 * Returns 0 or an error from VOP_STRATEGY()/biowait().  When the requested
 * block was already cached, only read-ahead bookkeeping is done.
 */
int
cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	long totread;
	int seqcount;
	struct buf **bpp;
{
	struct buf *bp, *rbp, *reqbp;
	daddr_t blkno, origblkno;
	int error, num_ra;
	int i;
	int maxra, racluster;
	long origtotread;

	error = 0;

	/*
	 * Try to limit the amount of read-ahead by a few
	 * ad-hoc parameters.  This needs work!!!
	 *
	 * racluster: blocks per maximal physical transfer; maxra is further
	 * capped by MAXRA and by 1/8 of the system's buffer headers.
	 */
	racluster = MAXPHYS/size;
	maxra = 2 * racluster + (totread / size);
	if (maxra > MAXRA)
		maxra = MAXRA;
	if (maxra > nbuf/8)
		maxra = nbuf/8;

	/*
	 * get the requested block
	 */
	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0);
	origblkno = lblkno;
	origtotread = totread;

	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		if (!seqcount) {
			/* not sequential: no read-ahead wanted */
			return 0;
		} else if ((bp->b_flags & B_RAM) == 0) {
			/* no read-ahead mark on this block; nothing to do */
			return 0;
		} else {
			int s;
			struct buf *tbp;
			bp->b_flags &= ~B_RAM;
			/*
			 * We do the spl here so that there is no window
			 * between the incore and the b_usecount increment
			 * below.  We opt to keep the spl out of the loop
			 * for efficiency.
			 */
			s = splbio();
			/*
			 * Scan forward for already-cached blocks; stop at
			 * the first miss.  lblkno advances past the cached
			 * run so read-ahead (below) starts at the miss.
			 */
			for(i=1;i<maxra;i++) {

				if (!(tbp = incore(vp, lblkno+i))) {
					break;
				}

				/*
				 * Set another read-ahead mark so we know to check
				 * again.
				 */
				if (((i % racluster) == (racluster - 1)) ||
					(i == (maxra - 1)))
					tbp->b_flags |= B_RAM;

#if 0
				if (tbp->b_usecount == 0) {
					/*
					 * Make sure that the soon-to-be used readaheads
					 * are still there.  The getblk/bqrelse pair will
					 * boost the priority of the buffer.
					 */
					tbp = getblk(vp, lblkno+i, size, 0, 0);
					bqrelse(tbp);
				}
#endif
			}
			splx(s);
			if (i >= maxra) {
				/* the whole window is cached: done */
				return 0;
			}
			lblkno += i;
		}
		/* requested block is cached; no synchronous I/O needed */
		reqbp = bp = NULL;
	} else {
		/*
		 * Cache miss: figure out how many contiguous blocks we can
		 * read in one clustered transfer, clamped to EOF.
		 */
		u_quad_t firstread;
		firstread = (u_quad_t) lblkno * size;
		if (firstread + totread > filesize)
			totread = filesize - firstread;
		if (totread > size) {
			int nblks = 0;
			int ncontigafter;
			/* nblks = ceil(totread / size), without a division */
			while (totread > 0) {
				nblks++;
				totread -= size;
			}
			if (nblks == 1)
				goto single_block_read;
			if (nblks > racluster)
				nblks = racluster;

			error = VOP_BMAP(vp, lblkno, NULL,
				&blkno, &ncontigafter, NULL);
			if (error)
				goto single_block_read;
			if (blkno == -1)
				goto single_block_read;
			if (ncontigafter == 0)
				goto single_block_read;
			if (ncontigafter + 1 < nblks)
				nblks = ncontigafter + 1;

			bp = cluster_rbuild(vp, filesize, lblkno,
				blkno, size, nblks, bp);
			lblkno += nblks;
		} else {
single_block_read:
			/*
			 * if it isn't in the cache, then get a chunk from
			 * disk if sequential, otherwise just get the block.
			 */
			bp->b_flags |= B_READ | B_RAM;
			lblkno += 1;
		}
	}

	/*
	 * if we have been doing sequential I/O, then do some read-ahead
	 */
	rbp = NULL;
	/* if (seqcount && (lblkno < (origblkno + maxra))) { */
	if (seqcount && (lblkno < (origblkno + seqcount))) {
		/*
		 * we now build the read-ahead buffer if it is desirable.
		 */
		if (((u_quad_t)(lblkno + 1) * size) <= filesize &&
		    !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) &&
		    blkno != -1) {
			int nblksread;
			int ntoread = num_ra + 1;
			nblksread = (origtotread + size - 1) / size;
			if (seqcount < nblksread)
				seqcount = nblksread;
			if (seqcount < ntoread)
				ntoread = seqcount;
			if (num_ra) {
				/* contiguous on disk: clustered read-ahead */
				rbp = cluster_rbuild(vp, filesize, lblkno,
					blkno, size, ntoread, NULL);
			} else {
				/* single asynchronous read-ahead block */
				rbp = getblk(vp, lblkno, size, 0, 0);
				rbp->b_flags |= B_READ | B_ASYNC | B_RAM;
				rbp->b_blkno = blkno;
			}
		}
	}

	/*
	 * handle the synchronous read
	 */
	if (bp) {
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			panic("cluster_read: DONE bp");
		} else {
#if defined(CLUSTERDEBUG)
			if (rcluster)
				printf("S(%d,%d,%d) ",
					bp->b_lblkno, bp->b_bcount, seqcount);
#endif
			if ((bp->b_flags & B_CLUSTER) == 0)
				vfs_busy_pages(bp, 0);
			error = VOP_STRATEGY(bp);
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	/*
	 * and if we have read-aheads, do them too
	 */
	if (rbp) {
		if (error) {
			/* synchronous read failed: drop the read-ahead */
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else if (rbp->b_flags & B_CACHE) {
			/* already valid: just release it */
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			bqrelse(rbp);
		} else {
#if defined(CLUSTERDEBUG)
			if (rcluster) {
				if (bp)
					printf("A+(%d,%d,%d,%d) ",
					    rbp->b_lblkno, rbp->b_bcount,
					    rbp->b_lblkno - origblkno,
					    seqcount);
				else
					printf("A(%d,%d,%d,%d) ",
					    rbp->b_lblkno, rbp->b_bcount,
					    rbp->b_lblkno - origblkno,
					    seqcount);
			}
#endif

			if ((rbp->b_flags & B_CLUSTER) == 0)
				vfs_busy_pages(rbp, 0);
			(void) VOP_STRATEGY(rbp);
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	if (reqbp)
		return (biowait(reqbp));
	else
		return (error);
}
/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 *
 *	vp	- vnode being read
 *	filesize - file size in bytes; "run" is trimmed so the cluster
 *		   never extends past EOF
 *	lbn/blkno - starting logical / physical block numbers
 *	size	- filesystem block size
 *	run	- number of contiguous blocks to try to read
 *	fbp	- optional first buffer (already obtained by the caller);
 *		  NULL means this is a pure read-ahead cluster
 *
 * Returns either a pseudo-buffer (B_CLUSTER) describing the whole
 * transfer, or a plain single-block buffer when clustering is not
 * possible (cached, malloc-backed, non-VMIO, or no pbuf available).
 */
static struct buf *
cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	struct buf *fbp;
{
	struct buf *bp, *tbp;
	daddr_t bn;
	int i, inc, j;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
	/*
	 * avoid a division
	 */
	while ((u_quad_t) size * (lbn + run) > filesize) {
		--run;
	}

	if (fbp) {
		tbp = fbp;
		tbp->b_flags |= B_READ;
	} else {
		tbp = getblk(vp, lbn, size, 0, 0);
		if (tbp->b_flags & B_CACHE)
			return tbp;
		tbp->b_flags |= B_ASYNC | B_READ | B_RAM;
	}

	tbp->b_blkno = blkno;
	/* clustering requires VMIO (page-backed) buffers and run > 1 */
	if( (tbp->b_flags & B_MALLOC) ||
		((tbp->b_flags & B_VMIO) == 0) || (run <= 1) )
		return tbp;

	bp = trypbuf();
	if (bp == 0)
		return tbp;

	/* keep the intra-page offset of the first component buffer */
	(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
	bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
	bp->b_iodone = cluster_callback;
	bp->b_blkno = blkno;
	bp->b_lblkno = lbn;
	pbgetvp(vp, bp);

	TAILQ_INIT(&bp->b_cluster.cluster_head);

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	inc = btodb(size);
	/*
	 * Gather component buffers.  Any condition that would make the
	 * cluster non-contiguous or over-sized terminates the loop; the
	 * cluster is then issued for the blocks gathered so far.
	 */
	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
		if (i != 0) {
			if ((bp->b_npages * PAGE_SIZE) +
				round_page(size) > MAXPHYS)
				break;

			if (incore(vp, lbn + i))
				break;

			tbp = getblk(vp, lbn + i, size, 0, 0);

			if ((tbp->b_flags & B_CACHE) ||
				(tbp->b_flags & B_VMIO) == 0) {
				bqrelse(tbp);
				break;
			}

			/* reject buffers with any already-valid pages */
			for (j=0;j<tbp->b_npages;j++) {
				if (tbp->b_pages[j]->valid) {
					break;
				}
			}

			if (j != tbp->b_npages) {
				/*
				 * force buffer to be re-constituted later
				 */
				tbp->b_flags |= B_RELBUF;
				brelse(tbp);
				break;
			}

			if ((fbp && (i == 1)) || (i == (run - 1)))
				tbp->b_flags |= B_RAM;
			tbp->b_flags |= B_READ | B_ASYNC;
			if (tbp->b_blkno == tbp->b_lblkno) {
				tbp->b_blkno = bn;
			} else if (tbp->b_blkno != bn) {
				/* physically discontiguous: stop here */
				brelse(tbp);
				break;
			}
		}
		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
			tbp, b_cluster.cluster_entry);
		/*
		 * Account the component's pages in the cluster buffer;
		 * consecutive buffers may share a boundary page, hence
		 * the check against the previously added page.
		 */
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;
			m = tbp->b_pages[j];
			++m->busy;
			++m->object->paging_in_progress;
			if ((bp->b_npages == 0) ||
				(bp->b_pages[bp->b_npages-1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
			/* fully-valid pages need no I/O: substitute bogus_page */
			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
				tbp->b_pages[j] = bogus_page;
		}
		bp->b_bcount += tbp->b_bcount;
		bp->b_bufsize += tbp->b_bufsize;
	}

	for(j=0;j<bp->b_npages;j++) {
		if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) ==
			VM_PAGE_BITS_ALL)
			bp->b_pages[j] = bogus_page;
	}
	if (bp->b_bufsize > bp->b_kvasize)
		panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)\n",
			bp->b_bufsize, bp->b_kvasize);
	bp->b_kvasize = bp->b_bufsize;

	/* map all gathered pages into the pbuf's KVA for the transfer */
	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
		(vm_page_t *)bp->b_pages, bp->b_npages);
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 *
 * Runs as the b_iodone handler of the pseudo-buffer built by
 * cluster_rbuild()/cluster_wbuild(): propagates any I/O error to every
 * component buffer, completes them, and releases the pseudo-buffer.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct buf *nbp, *tbp;
	int error = 0;

	/*
	 * Must propogate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
		tbp; tbp = nbp) {
		nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		} else
			tbp->b_dirtyoff = tbp->b_dirtyend = 0;
		biodone(tbp);
	}
	relpbuf(bp);
}

/*
 * Do clustered write for FFS.
 *
 * Three cases:
 *	1.	Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 *
 * Cluster state is kept per-vnode in v_cstart/v_clen/v_lastw/v_lasta.
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;
	int lblocksize;
	int async;

	vp = bp->b_vp;
	async = vp->v_mount->mnt_flag & MNT_ASYNC;
	lblocksize = vp->v_mount->mnt_stat.f_iosize;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
		maxclen = MAXPHYS / lblocksize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster. Otherwise try
			 * reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
#ifndef notyet_block_reallocation_enabled
			if (((u_quad_t)(lbn + 1) * lblocksize) != filesize ||
				lbn != vp->v_lastw + 1 ||
				vp->v_clen <= cursize) {
				if (!async)
					cluster_wbuild(vp, lblocksize,
						vp->v_cstart, cursize);
			}
#else
			if ((lbn + 1) * lblocksize != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				if (!async)
					cluster_wbuild(vp, lblocksize,
						vp->v_cstart, cursize);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 */
					for (bpp = buflist->bs_children;
					    bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, lblocksize,
					    vp->v_cstart, cursize);
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					    bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
#endif /* notyet_block_reallocation_enabled */
		}
		/*
		 * Consider beginning a cluster. If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if (((u_quad_t) (lbn + 1) * lblocksize) != filesize &&
		    (bp->b_blkno == bp->b_lblkno) &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
		     bp->b_blkno == -1)) {
			/* block is unmapped: just write it asynchronously */
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (!async && maxclen == 0) {	/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {	/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		bdwrite(bp);
		cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 *
 * Write out up to "len" delayed-write blocks of "vp" starting at
 * start_lbn, coalescing physically contiguous ones into clustered
 * transfers.  Returns the total number of bytes issued for write.
 */
int
cluster_wbuild(vp, size, start_lbn, len)
	struct vnode *vp;
	long size;
	daddr_t start_lbn;
	int len;
{
	struct buf *bp, *tbp;
	int i, j, s;
	int totalwritten = 0;
	int dbsize = btodb(size);
	while (len > 0) {
		s = splbio();
		/* skip blocks that are not in core or not pure delayed-writes */
		if ( ((tbp = gbincore(vp, start_lbn)) == NULL) ||
			((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) {
			++start_lbn;
			--len;
			splx(s);
			continue;
		}
		bremfree(tbp);
		tbp->b_flags |= B_BUSY;
		tbp->b_flags &= ~B_DONE;
		splx(s);

	/*
	 * Extra memory in the buffer, punt on this buffer. XXX we could
	 * handle this in most cases, but we would have to push the extra
	 * memory down to after our max possible cluster size and then
	 * potentially pull it back up if the cluster was terminated
	 * prematurely--too much hassle.
	 */
		if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
			(tbp->b_bcount != tbp->b_bufsize) ||
			(tbp->b_bcount != size) ||
			len == 1) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			++start_lbn;
			--len;
			continue;
		}

		bp = trypbuf();
		if (bp == NULL) {
			/* no pseudo-buffer available: write it unclustered */
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			++start_lbn;
			--len;
			continue;
		}

		TAILQ_INIT(&bp->b_cluster.cluster_head);
		bp->b_bcount = 0;
		bp->b_bufsize = 0;
		bp->b_npages = 0;
		if (tbp->b_wcred != NOCRED) {
		    bp->b_wcred = tbp->b_wcred;
		    crhold(bp->b_wcred);
		}

		bp->b_blkno = tbp->b_blkno;
		bp->b_lblkno = tbp->b_lblkno;
		/* keep the intra-page offset of the first component buffer */
		(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
		bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER | (tbp->b_flags & (B_VMIO|B_NEEDCOMMIT));
		bp->b_iodone = cluster_callback;
		pbgetvp(vp, bp);

		/*
		 * Gather component buffers.  Any mismatch in flags,
		 * credentials, size, or physical contiguity ends the
		 * cluster at the blocks gathered so far.
		 */
		for (i = 0; i < len; ++i, ++start_lbn) {
			if (i != 0) {
				s = splbio();
				if ((tbp = gbincore(vp, start_lbn)) == NULL) {
					splx(s);
					break;
				}

				if ((tbp->b_flags & (B_VMIO|B_CLUSTEROK|B_INVAL|B_BUSY|B_DELWRI|B_NEEDCOMMIT)) != (B_DELWRI|B_CLUSTEROK|(bp->b_flags & (B_VMIO|B_NEEDCOMMIT)))) {
					splx(s);
					break;
				}

				if (tbp->b_wcred != bp->b_wcred) {
					splx(s);
					break;
				}

				if ((tbp->b_bcount != size) ||
					((bp->b_blkno + dbsize * i) != tbp->b_blkno) ||
					((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE))) {
					splx(s);
					break;
				}
				bremfree(tbp);
				tbp->b_flags |= B_BUSY;
				tbp->b_flags &= ~B_DONE;
				splx(s);
			}
			if (tbp->b_flags & B_VMIO) {
				/*
				 * Account the component's pages; adjacent
				 * buffers may share a boundary page.
				 */
				for (j = 0; j < tbp->b_npages; j += 1) {
					vm_page_t m;
					m = tbp->b_pages[j];
					++m->busy;
					++m->object->paging_in_progress;
					if ((bp->b_npages == 0) ||
						(bp->b_pages[bp->b_npages - 1] != m)) {
						bp->b_pages[bp->b_npages] = m;
						bp->b_npages++;
					}
				}
			}
			bp->b_bcount += size;
			bp->b_bufsize += size;

			--numdirtybuffers;
			tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
			tbp->b_flags |= B_ASYNC;
			s = splbio();
			reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
			++tbp->b_vp->v_numoutput;
			splx(s);
			TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
				tbp, b_cluster.cluster_entry);
		}
		/* map all gathered pages into the pbuf's KVA for the transfer */
		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
			(vm_page_t *) bp->b_pages, bp->b_npages);
		if (bp->b_bufsize > bp->b_kvasize)
			panic("cluster_wbuild: b_bufsize(%d) > b_kvasize(%d)\n",
				bp->b_bufsize, bp->b_kvasize);
		bp->b_kvasize = bp->b_bufsize;
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
		bawrite(bp);

		len -= i;
	}
	return totalwritten;
}

#ifdef notyet_block_reallocation_enabled
/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 *
 * Reads the blocks v_cstart..v_lastw of vp into a freshly malloc'ed
 * cluster_save (children array follows the header in the same
 * allocation), appends last_bp, and returns the list.  The caller owns
 * the allocation and must free it with M_SEGMENT.
 */
static struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *buflist;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED,
		    &buflist->bs_children[i]);
	buflist->bs_children[i] = last_bp;
	buflist->bs_nchildren = i + 1;
	return (buflist);
}
#endif /* notyet_block_reallocation_enabled */