vfs_cluster.c revision 68868
1218792Snp/*- 2218792Snp * Copyright (c) 1993 3218792Snp * The Regents of the University of California. All rights reserved. 4218792Snp * Modifications/enhancements: 5218792Snp * Copyright (c) 1995 John S. Dyson. All rights reserved. 6218792Snp * 7218792Snp * Redistribution and use in source and binary forms, with or without 8218792Snp * modification, are permitted provided that the following conditions 9218792Snp * are met: 10218792Snp * 1. Redistributions of source code must retain the above copyright 11218792Snp * notice, this list of conditions and the following disclaimer. 12218792Snp * 2. Redistributions in binary form must reproduce the above copyright 13218792Snp * notice, this list of conditions and the following disclaimer in the 14218792Snp * documentation and/or other materials provided with the distribution. 15218792Snp * 3. All advertising materials mentioning features or use of this software 16218792Snp * must display the following acknowledgement: 17218792Snp * This product includes software developed by the University of 18218792Snp * California, Berkeley and its contributors. 19218792Snp * 4. Neither the name of the University nor the names of its contributors 20218792Snp * may be used to endorse or promote products derived from this software 21218792Snp * without specific prior written permission. 22218792Snp * 23218792Snp * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24218792Snp * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25218792Snp * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26218792Snp * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27218792Snp * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28218792Snp * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29218792Snp * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30218792Snp * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31218792Snp * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32237819Snp * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33218792Snp * SUCH DAMAGE. 34218792Snp * 35218792Snp * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 36218792Snp * $FreeBSD: head/sys/kern/vfs_cluster.c 68868 2000-11-17 23:40:08Z tegge $ 37218792Snp */ 38218792Snp 39218792Snp#include "opt_debug_cluster.h" 40219286Snp 41219286Snp#include <sys/param.h> 42219286Snp#include <sys/systm.h> 43218792Snp#include <sys/kernel.h> 44218792Snp#include <sys/proc.h> 45218792Snp#include <sys/bio.h> 46218792Snp#include <sys/buf.h> 47218792Snp#include <sys/vnode.h> 48219436Snp#include <sys/malloc.h> 49218792Snp#include <sys/mount.h> 50218792Snp#include <sys/resourcevar.h> 51218792Snp#include <vm/vm.h> 52218792Snp#include <vm/vm_object.h> 53218792Snp#include <vm/vm_page.h> 54218792Snp#include <sys/sysctl.h> 55218792Snp 56218792Snp#if defined(CLUSTERDEBUG) 57222003Snp#include <sys/sysctl.h> 58248925Snpstatic int rcluster= 0; 59248925SnpSYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, ""); 60248925Snp#endif 61248925Snp 62218792Snpstatic MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer"); 63218792Snp 64221474Snpstatic struct cluster_save * 65218792Snp cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp)); 66218792Snpstatic struct buf * 67218792Snp cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn, 68222509Snp daddr_t blkno, long size, int run, struct buf *fbp)); 69218792Snp 70218792Snpstatic int write_behind = 1; 71218792SnpSYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, ""); 72218792Snp 73218792Snpextern vm_page_t bogus_page; 74218792Snp 75218792Snpextern int cluster_pbuf_freecnt; 76218792Snp 77218792Snp/* 78218792Snp * Maximum number of blocks for read-ahead. 79227843Smarius */ 80218792Snp#define MAXRA 32 81218792Snp 82218792Snp/* 83218792Snp * This replaces bread. 84218792Snp */ 85218792Snpint 86218792Snpcluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) 87218792Snp struct vnode *vp; 88218792Snp u_quad_t filesize; 89218792Snp daddr_t lblkno; 90218792Snp long size; 91218792Snp struct ucred *cred; 92218792Snp long totread; 93218792Snp int seqcount; 94218792Snp struct buf **bpp; 95218792Snp{ 96218792Snp struct buf *bp, *rbp, *reqbp; 97218792Snp daddr_t blkno, origblkno; 98218792Snp int error, num_ra; 99218792Snp int i; 100218792Snp int maxra, racluster; 101218792Snp long origtotread; 102218792Snp 103218792Snp error = 0; 104218792Snp 105218792Snp /* 106218792Snp * Try to limit the amount of read-ahead by a few 107218792Snp * ad-hoc parameters. This needs work!!! 108218792Snp */ 109218792Snp racluster = vp->v_mount->mnt_iosize_max / size; 110218792Snp maxra = 2 * racluster + (totread / size); 111218792Snp if (maxra > MAXRA) 112218792Snp maxra = MAXRA; 113218792Snp if (maxra > nbuf/8) 114218792Snp maxra = nbuf/8; 115218792Snp 116218792Snp /* 117248925Snp * get the requested block 118248925Snp */ 119248925Snp *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0); 120248925Snp origblkno = lblkno; 121248925Snp origtotread = totread; 122248925Snp 123248925Snp /* 124248925Snp * if it is in the cache, then check to see if the reads have been 125248925Snp * sequential. If they have, then try some read-ahead, otherwise 126248925Snp * back-off on prospective read-aheads. 127248925Snp */ 128248925Snp if (bp->b_flags & B_CACHE) { 129248925Snp if (!seqcount) { 130248925Snp return 0; 131248925Snp } else if ((bp->b_flags & B_RAM) == 0) { 132248925Snp return 0; 133248925Snp } else { 134248925Snp int s; 135248925Snp struct buf *tbp; 136248925Snp bp->b_flags &= ~B_RAM; 137248925Snp /* 138248925Snp * We do the spl here so that there is no window 139248925Snp * between the incore and the b_usecount increment 140248925Snp * below. We opt to keep the spl out of the loop 141248925Snp * for efficiency. 142248925Snp */ 143248925Snp s = splbio(); 144248925Snp for (i = 1; i < maxra; i++) { 145248925Snp 146248925Snp if (!(tbp = incore(vp, lblkno+i))) { 147248925Snp break; 148248925Snp } 149218792Snp 150218792Snp /* 151218792Snp * Set another read-ahead mark so we know 152218792Snp * to check again. 153218792Snp */ 154218792Snp if (((i % racluster) == (racluster - 1)) || 155218792Snp (i == (maxra - 1))) 156218792Snp tbp->b_flags |= B_RAM; 157248925Snp } 158218792Snp splx(s); 159237263Snp if (i >= maxra) { 160237263Snp return 0; 161237263Snp } 162237263Snp lblkno += i; 163228561Snp } 164228561Snp reqbp = bp = NULL; 165237263Snp } else { 166228561Snp off_t firstread = bp->b_offset; 167228561Snp 168228561Snp KASSERT(bp->b_offset != NOOFFSET, 169218792Snp ("cluster_read: no buffer offset")); 170218792Snp if (firstread + totread > filesize) 171228561Snp totread = filesize - firstread; 172248925Snp if (totread > size) { 173248925Snp int nblks = 0; 174248925Snp int ncontigafter; 175248925Snp while (totread > 0) { 176248925Snp nblks++; 177248925Snp totread -= size; 178248925Snp } 179218792Snp if (nblks == 1) 180218792Snp goto single_block_read; 181218792Snp if (nblks > racluster) 182228561Snp nblks = racluster; 183218792Snp 184228561Snp error = VOP_BMAP(vp, lblkno, NULL, 185228561Snp &blkno, &ncontigafter, NULL); 186228561Snp if (error) 187218792Snp goto single_block_read; 188228561Snp if (blkno == -1) 189228561Snp goto single_block_read; 190228561Snp if (ncontigafter == 0) 191218792Snp goto single_block_read; 192228561Snp if (ncontigafter + 1 < nblks) 193228561Snp nblks = ncontigafter + 1; 194228561Snp 195218792Snp bp = cluster_rbuild(vp, filesize, lblkno, 196228561Snp blkno, size, nblks, bp); 197228561Snp lblkno += (bp->b_bufsize / size); 198228561Snp } else { 199218792Snpsingle_block_read: 200237263Snp /* 201228561Snp * if it isn't in the cache, then get a chunk from 202228561Snp * disk if sequential, otherwise just get the block. 203228561Snp */ 204228561Snp bp->b_flags |= B_RAM; 205228561Snp bp->b_iocmd = BIO_READ; 206228561Snp lblkno += 1; 207228561Snp } 208228561Snp } 209228561Snp 210228561Snp /* 211228561Snp * if we have been doing sequential I/O, then do some read-ahead 212228561Snp */ 213228561Snp rbp = NULL; 214228561Snp if (seqcount && (lblkno < (origblkno + seqcount))) { 215228561Snp /* 216228561Snp * we now build the read-ahead buffer if it is desirable. 217228561Snp */ 218218792Snp if (((u_quad_t)(lblkno + 1) * size) <= filesize && 219218792Snp !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) && 220218792Snp blkno != -1) { 221228561Snp int nblksread; 222228561Snp int ntoread = num_ra + 1; 223228561Snp nblksread = (origtotread + size - 1) / size; 224218792Snp if (seqcount < nblksread) 225234833Snp seqcount = nblksread; 226228561Snp if (seqcount < ntoread) 227228561Snp ntoread = seqcount; 228218792Snp if (num_ra) { 229228561Snp rbp = cluster_rbuild(vp, filesize, lblkno, 230228561Snp blkno, size, ntoread, NULL); 231228561Snp } else { 232218792Snp rbp = getblk(vp, lblkno, size, 0, 0); 233234833Snp rbp->b_flags |= B_ASYNC | B_RAM; 234228561Snp rbp->b_iocmd = BIO_READ; 235228561Snp rbp->b_blkno = blkno; 236218792Snp } 237218792Snp } 238218792Snp } 239218792Snp 240228561Snp /* 241228561Snp * handle the synchronous read 242218792Snp */ 243228561Snp if (bp) { 244228561Snp#if defined(CLUSTERDEBUG) 245218792Snp if (rcluster) 246218792Snp printf("S(%ld,%ld,%d) ", 247228561Snp (long)bp->b_lblkno, bp->b_bcount, seqcount); 248218792Snp#endif 249228561Snp if ((bp->b_flags & B_CLUSTER) == 0) 250228561Snp vfs_busy_pages(bp, 0); 251218792Snp bp->b_flags &= ~B_INVAL; 252218792Snp bp->b_ioflags &= ~BIO_ERROR; 253228561Snp if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL) 254218792Snp BUF_KERNPROC(bp); 255248925Snp error = VOP_STRATEGY(vp, bp); 256248925Snp curproc->p_stats->p_ru.ru_inblock++; 257248925Snp } 258249376Snp 259248925Snp /* 260228561Snp * and if we have read-aheads, do them too 261218792Snp */ 262228561Snp if (rbp) { 263247347Snp if (error) { 264247347Snp rbp->b_flags &= ~B_ASYNC; 265247347Snp brelse(rbp); 266247347Snp } else if (rbp->b_flags & B_CACHE) { 267247347Snp rbp->b_flags &= ~B_ASYNC; 268247347Snp bqrelse(rbp); 269247347Snp } else { 270228561Snp#if defined(CLUSTERDEBUG) 271228561Snp if (rcluster) { 272228561Snp if (bp) 273228561Snp printf("A+(%ld,%ld,%ld,%d) ", 274228561Snp (long)rbp->b_lblkno, rbp->b_bcount, 275221474Snp (long)(rbp->b_lblkno - origblkno), 276228561Snp seqcount); 277228561Snp else 278228561Snp printf("A(%ld,%ld,%ld,%d) ", 279238028Snp (long)rbp->b_lblkno, rbp->b_bcount, 280228561Snp (long)(rbp->b_lblkno - origblkno), 281228561Snp seqcount); 282228561Snp } 283228561Snp#endif 284228561Snp 285228561Snp if ((rbp->b_flags & B_CLUSTER) == 0) 286228561Snp vfs_busy_pages(rbp, 0); 287228561Snp rbp->b_flags &= ~B_INVAL; 288228561Snp rbp->b_ioflags &= ~BIO_ERROR; 289228561Snp if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL) 290228561Snp BUF_KERNPROC(rbp); 291248925Snp (void) VOP_STRATEGY(vp, rbp); 292248925Snp curproc->p_stats->p_ru.ru_inblock++; 293248925Snp } 294218792Snp } 295219944Snp if (reqbp) 296218792Snp return (bufwait(reqbp)); 297228561Snp else 298218792Snp return (error); 299218792Snp} 300218792Snp 301218792Snp/* 302237263Snp * If blocks are contiguous on disk, use this to provide clustered 303228561Snp * read ahead. We will read as many blocks as possible sequentially 304228561Snp * and then parcel them up into logical blocks in the buffer hash table. 305228561Snp */ 306228561Snpstatic struct buf * 307228561Snpcluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) 308218792Snp struct vnode *vp; 309218792Snp u_quad_t filesize; 310221474Snp daddr_t lbn; 311221474Snp daddr_t blkno; 312221474Snp long size; 313221474Snp int run; 314221474Snp struct buf *fbp; 315222509Snp{ 316221474Snp struct buf *bp, *tbp; 317221474Snp daddr_t bn; 318221474Snp int i, inc, j; 319221474Snp 320218792Snp KASSERT(size == vp->v_mount->mnt_stat.f_iosize, 321218792Snp ("cluster_rbuild: size %ld != filesize %ld\n", 322218792Snp size, vp->v_mount->mnt_stat.f_iosize)); 323218792Snp 324218792Snp /* 325218792Snp * avoid a division 326218792Snp */ 327218792Snp while ((u_quad_t) size * (lbn + run) > filesize) { 328218792Snp --run; 329218792Snp } 330218792Snp 331248925Snp if (fbp) { 332248925Snp tbp = fbp; 333218792Snp tbp->b_iocmd = BIO_READ; 334248925Snp } else { 335248925Snp tbp = getblk(vp, lbn, size, 0, 0); 336248925Snp if (tbp->b_flags & B_CACHE) 337248925Snp return tbp; 338248925Snp tbp->b_flags |= B_ASYNC | B_RAM; 339218792Snp tbp->b_iocmd = BIO_READ; 340218792Snp } 341218792Snp 342248925Snp tbp->b_blkno = blkno; 343248925Snp if( (tbp->b_flags & B_MALLOC) || 344228561Snp ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) 345228561Snp return tbp; 346247291Snp 347218792Snp bp = trypbuf(&cluster_pbuf_freecnt); 348218792Snp if (bp == 0) 349218792Snp return tbp; 350218792Snp 351218792Snp bp->b_data = (char *)((vm_offset_t)bp->b_data | 352240453Snp ((vm_offset_t)tbp->b_data & PAGE_MASK)); 353228561Snp bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO; 354228561Snp bp->b_iocmd = BIO_READ; 355228561Snp bp->b_iodone = cluster_callback; 356228561Snp bp->b_blkno = blkno; 357228561Snp bp->b_lblkno = lbn; 358228561Snp bp->b_offset = tbp->b_offset; 359228561Snp KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset")); 360218792Snp pbgetvp(vp, bp); 361228561Snp 362218792Snp TAILQ_INIT(&bp->b_cluster.cluster_head); 363218792Snp 364218792Snp bp->b_bcount = 0; 365218792Snp bp->b_bufsize = 0; 366218792Snp bp->b_npages = 0; 367237263Snp 368228561Snp inc = btodb(size); 369228561Snp for (bn = blkno, i = 0; i < run; ++i, bn += inc) { 370237263Snp if (i != 0) { 371239336Snp if ((bp->b_npages * PAGE_SIZE) + 372218792Snp round_page(size) > vp->v_mount->mnt_iosize_max) 373218792Snp break; 374219436Snp 375228561Snp if ((tbp = incore(vp, lbn + i)) != NULL) { 376218792Snp if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) 377218792Snp break; 378218792Snp BUF_UNLOCK(tbp); 379218792Snp 380218792Snp for (j = 0; j < tbp->b_npages; j++) 381231115Snp if (tbp->b_pages[j]->valid) 382228561Snp break; 383247122Snp 384247122Snp if (j != tbp->b_npages) 385247122Snp break; 386228561Snp 387228561Snp if (tbp->b_bcount != size) 388222551Snp break; 389228561Snp } 390228561Snp 391228561Snp tbp = getblk(vp, lbn + i, size, 0, 0); 392228561Snp 393228561Snp if ((tbp->b_flags & B_CACHE) || 394228561Snp (tbp->b_flags & B_VMIO) == 0) { 395228561Snp bqrelse(tbp); 396228561Snp break; 397228561Snp } 398228561Snp 399228561Snp for (j = 0;j < tbp->b_npages; j++) 400248925Snp if (tbp->b_pages[j]->valid) 401231115Snp break; 402219286Snp 403221474Snp if (j != tbp->b_npages) { 404221474Snp bqrelse(tbp); 405221474Snp break; 406221474Snp } 407221474Snp 408222552Snp if ((fbp && (i == 1)) || (i == (run - 1))) 409221474Snp tbp->b_flags |= B_RAM; 410221474Snp tbp->b_flags |= B_ASYNC; 411221474Snp tbp->b_iocmd = BIO_READ; 412222509Snp if (tbp->b_blkno == tbp->b_lblkno) { 413221474Snp tbp->b_blkno = bn; 414221474Snp } else if (tbp->b_blkno != bn) { 415222973Snp brelse(tbp); 416245274Snp break; 417248925Snp } 418241399Snp } 419237263Snp /* 420228561Snp * XXX fbp from caller may not be B_ASYNC, but we are going 421228561Snp * to biodone() it in cluster_callback() anyway 422249370Snp */ 423218792Snp BUF_KERNPROC(tbp); 424248925Snp TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, 425218792Snp tbp, b_cluster.cluster_entry); 426218792Snp for (j = 0; j < tbp->b_npages; j += 1) { 427218792Snp vm_page_t m; 428237587Snp m = tbp->b_pages[j]; 429237587Snp vm_page_io_start(m); 430237587Snp vm_object_pip_add(m->object, 1); 431237587Snp if ((bp->b_npages == 0) || 432237587Snp (bp->b_pages[bp->b_npages-1] != m)) { 433237587Snp bp->b_pages[bp->b_npages] = m; 434237587Snp bp->b_npages++; 435237587Snp } 436237587Snp if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) 437237587Snp tbp->b_pages[j] = bogus_page; 438237587Snp } 439237587Snp bp->b_bcount += tbp->b_bcount; 440244580Snp bp->b_bufsize += tbp->b_bufsize; 441248925Snp } 442248925Snp 443248925Snp for(j=0;j<bp->b_npages;j++) { 444218792Snp if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) == 445218792Snp VM_PAGE_BITS_ALL) 446237263Snp bp->b_pages[j] = bogus_page; 447237263Snp } 448237263Snp if (bp->b_bufsize > bp->b_kvasize) 449237263Snp panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", 450237263Snp bp->b_bufsize, bp->b_kvasize); 451237263Snp bp->b_kvasize = bp->b_bufsize; 452228561Snp 453228561Snp pmap_qenter(trunc_page((vm_offset_t) bp->b_data), 454228561Snp (vm_page_t *)bp->b_pages, bp->b_npages); 455239336Snp return (bp); 456240452Snp} 457240452Snp 458239336Snp/* 459218792Snp * Cleanup after a clustered read or write. 460218792Snp * This is complicated by the fact that any of the buffers might have 461218792Snp * extra memory (if there were no empty buffer headers at allocbuf time) 462218792Snp * that we will need to shift around. 463218792Snp */ 464218792Snpvoid 465237587Snpcluster_callback(bp) 466218792Snp struct buf *bp; 467218792Snp{ 468218792Snp struct buf *nbp, *tbp; 469218792Snp int error = 0; 470237587Snp 471237587Snp /* 472237587Snp * Must propogate errors to all the components. 473237587Snp */ 474240452Snp if (bp->b_ioflags & BIO_ERROR) 475237587Snp error = bp->b_error; 476218792Snp 477218792Snp pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); 478218792Snp /* 479218792Snp * Move memory from the large cluster buffer into the component 480218792Snp * buffers and mark IO as done on these. 481218792Snp */ 482218792Snp for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); 483218792Snp tbp; tbp = nbp) { 484218792Snp nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); 485248925Snp if (error) { 486248925Snp tbp->b_ioflags |= BIO_ERROR; 487248925Snp tbp->b_error = error; 488248925Snp } else { 489248925Snp tbp->b_dirtyoff = tbp->b_dirtyend = 0; 490248925Snp tbp->b_flags &= ~B_INVAL; 491248925Snp tbp->b_ioflags &= ~BIO_ERROR; 492248925Snp } 493248925Snp bufdone(tbp); 494248925Snp } 495248925Snp relpbuf(bp, &cluster_pbuf_freecnt); 496248925Snp} 497248925Snp 498248925Snp/* 499248925Snp * cluster_wbuild_wb: 500248925Snp * 501248925Snp * Implement modified write build for cluster. 502248925Snp * 503248925Snp * write_behind = 0 write behind disabled 504248925Snp * write_behind = 1 write behind normal (default) 505248925Snp * write_behind = 2 write behind backed-off 506248925Snp */ 507248925Snp 508248925Snpstatic __inline int 509248925Snpcluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len) 510218792Snp{ 511218792Snp int r = 0; 512218792Snp 513218792Snp switch(write_behind) { 514218792Snp case 2: 515218792Snp if (start_lbn < len) 516237263Snp break; 517228561Snp start_lbn -= len; 518228561Snp /* fall through */ 519218792Snp case 1: 520218792Snp r = cluster_wbuild(vp, size, start_lbn, len); 521218792Snp /* fall through */ 522218792Snp default: 523218792Snp /* fall through */ 524222085Snp break; 525228561Snp } 526228561Snp return(r); 527222085Snp} 528240680Sgavin 529240680Sgavin/* 530240680Sgavin * Do clustered write for FFS. 531222085Snp * 532222085Snp * Three cases: 533218792Snp * 1. Write is not sequential (write asynchronously) 534218792Snp * Write is sequential: 535218792Snp * 2. beginning of cluster - begin cluster 536228561Snp * 3. middle of a cluster - add to cluster 537228561Snp * 4. end of a cluster - asynchronously write cluster 538228561Snp */ 539218792Snpvoid 540228561Snpcluster_write(bp, filesize, seqcount) 541228561Snp struct buf *bp; 542228561Snp u_quad_t filesize; 543228561Snp int seqcount; 544248925Snp{ 545218792Snp struct vnode *vp; 546218792Snp daddr_t lbn; 547218792Snp int maxclen, cursize; 548237587Snp int lblocksize; 549237587Snp int async; 550237587Snp 551237587Snp vp = bp->b_vp; 552237587Snp if (vp->v_type == VREG) { 553237587Snp async = vp->v_mount->mnt_flag & MNT_ASYNC; 554237587Snp lblocksize = vp->v_mount->mnt_stat.f_iosize; 555237587Snp } else { 556237587Snp async = 0; 557218792Snp lblocksize = bp->b_bufsize; 558237263Snp } 559240452Snp lbn = bp->b_lblkno; 560228561Snp KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")); 561240452Snp 562239336Snp /* Initialize vnode to beginning of file. */ 563239338Snp if (lbn == 0) 564248925Snp vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; 565218792Snp 566218792Snp if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || 567218792Snp (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { 568218792Snp maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1; 569218792Snp if (vp->v_clen != 0) { 570218792Snp /* 571218792Snp * Next block is not sequential. 572218792Snp * 573228561Snp * If we are not writing at end of file, the process 574228561Snp * seeked to another point in the file since its last 575228561Snp * write, or we have reached our maximum cluster size, 576228561Snp * then push the previous cluster. Otherwise try 577228561Snp * reallocating to make it sequential. 578228561Snp * 579248925Snp * Change to algorithm: only push previous cluster if 580248925Snp * it was sequential from the point of view of the 581248925Snp * seqcount heuristic, otherwise leave the buffer 582248925Snp * intact so we can potentially optimize the I/O 583248925Snp * later on in the buf_daemon or update daemon 584248925Snp * flush. 585248925Snp */ 586218792Snp cursize = vp->v_lastw - vp->v_cstart + 1; 587228561Snp if (((u_quad_t) bp->b_offset + lblocksize) != filesize || 588228561Snp lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { 589228561Snp if (!async && seqcount > 0) { 590228561Snp cluster_wbuild_wb(vp, lblocksize, 591228561Snp vp->v_cstart, cursize); 592228561Snp } 593218792Snp } else { 594218792Snp struct buf **bpp, **endbp; 595218792Snp struct cluster_save *buflist; 596218792Snp 597218792Snp buflist = cluster_collectbufs(vp, bp); 598248925Snp endbp = &buflist->bs_children 599228561Snp [buflist->bs_nchildren - 1]; 600228561Snp if (VOP_REALLOCBLKS(vp, buflist)) { 601222551Snp /* 602248925Snp * Failed, push the previous cluster 603228561Snp * if *really* writing sequentially 604228561Snp * in the logical file (seqcount > 1), 605218792Snp * otherwise delay it in the hopes that 606248925Snp * the low level disk driver can 607228561Snp * optimize the write ordering. 608228561Snp */ 609218792Snp for (bpp = buflist->bs_children; 610228561Snp bpp < endbp; bpp++) 611228561Snp brelse(*bpp); 612218792Snp free(buflist, M_SEGMENT); 613218792Snp if (seqcount > 1) { 614218792Snp cluster_wbuild_wb(vp, 615218792Snp lblocksize, vp->v_cstart, 616218792Snp cursize); 617218792Snp } 618218792Snp } else { 619218792Snp /* 620218792Snp * Succeeded, keep building cluster. 621218792Snp */ 622218792Snp for (bpp = buflist->bs_children; 623218792Snp bpp <= endbp; bpp++) 624218792Snp bdwrite(*bpp); 625218792Snp free(buflist, M_SEGMENT); 626218792Snp vp->v_lastw = lbn; 627218792Snp vp->v_lasta = bp->b_blkno; 628218792Snp return; 629218792Snp } 630218792Snp } 631218792Snp } 632218792Snp /* 633218792Snp * Consider beginning a cluster. If at end of file, make 634218792Snp * cluster as large as possible, otherwise find size of 635218792Snp * existing cluster. 636218792Snp */ 637218792Snp if ((vp->v_type == VREG) && 638218792Snp ((u_quad_t) bp->b_offset + lblocksize) != filesize && 639218792Snp (bp->b_blkno == bp->b_lblkno) && 640222510Snp (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || 641222510Snp bp->b_blkno == -1)) { 642218792Snp bawrite(bp); 643218792Snp vp->v_clen = 0; 644218792Snp vp->v_lasta = bp->b_blkno; 645218792Snp vp->v_cstart = lbn + 1; 646218792Snp vp->v_lastw = lbn; 647218792Snp return; 648218792Snp } 649218792Snp vp->v_clen = maxclen; 650228561Snp if (!async && maxclen == 0) { /* I/O not contiguous */ 651228561Snp vp->v_cstart = lbn + 1; 652218792Snp bawrite(bp); 653218792Snp } else { /* Wait for rest of cluster */ 654228561Snp vp->v_cstart = lbn; 655228561Snp bdwrite(bp); 656218792Snp } 657218792Snp } else if (lbn == vp->v_cstart + vp->v_clen) { 658218792Snp /* 659218792Snp * At end of cluster, write it out if seqcount tells us we 660228561Snp * are operating sequentially, otherwise let the buf or 661228561Snp * update daemon handle it. 662218792Snp */ 663248925Snp bdwrite(bp); 664218792Snp if (seqcount > 1) 665218792Snp cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); 666218792Snp vp->v_clen = 0; 667218792Snp vp->v_cstart = lbn + 1; 668218792Snp } else { 669218792Snp /* 670218792Snp * In the middle of a cluster, so just delay the I/O for now. 671218792Snp */ 672218792Snp bdwrite(bp); 673218792Snp } 674218792Snp vp->v_lastw = lbn; 675218792Snp vp->v_lasta = bp->b_blkno; 676218792Snp} 677218792Snp 678218792Snp 679218792Snp/* 680218792Snp * This is an awful lot like cluster_rbuild...wish they could be combined. 681218792Snp * The last lbn argument is the current block on which I/O is being 682228561Snp * performed. Check to see that it doesn't fall in the middle of 683218792Snp * the current block (if last_bp == NULL). 684218792Snp */ 685218792Snpint 686218792Snpcluster_wbuild(vp, size, start_lbn, len) 687220873Snp struct vnode *vp; 688228561Snp long size; 689218792Snp daddr_t start_lbn; 690222510Snp int len; 691237263Snp{ 692228561Snp struct buf *bp, *tbp; 693228561Snp int i, j, s; 694228561Snp int totalwritten = 0; 695228561Snp int dbsize = btodb(size); 696228561Snp 697228561Snp while (len > 0) { 698228561Snp s = splbio(); 699228561Snp if (((tbp = gbincore(vp, start_lbn)) == NULL) || 700228561Snp ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) || 701228561Snp BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) { 702228561Snp ++start_lbn; 703228561Snp --len; 704228561Snp splx(s); 705228561Snp continue; 706228561Snp } 707220873Snp bremfree(tbp); 708218792Snp tbp->b_flags &= ~B_DONE; 709218792Snp splx(s); 710218792Snp 711218792Snp /* 712218792Snp * Extra memory in the buffer, punt on this buffer. 713218792Snp * XXX we could handle this in most cases, but we would 714218792Snp * have to push the extra memory down to after our max 715218792Snp * possible cluster size and then potentially pull it back 716218792Snp * up if the cluster was terminated prematurely--too much 717218792Snp * hassle. 718218792Snp */ 719218792Snp if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) != 720228561Snp (B_CLUSTEROK | B_VMIO)) || 721222509Snp (tbp->b_bcount != tbp->b_bufsize) || 722218792Snp (tbp->b_bcount != size) || 723218792Snp (len == 1) || 724218792Snp ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) { 725218792Snp totalwritten += tbp->b_bufsize; 726218792Snp bawrite(tbp); 727237263Snp ++start_lbn; 728228561Snp --len; 729228561Snp continue; 730218792Snp } 731218792Snp 732218792Snp /* 733218792Snp * We got a pbuf to make the cluster in. 734218792Snp * so initialise it. 735218792Snp */ 736218792Snp TAILQ_INIT(&bp->b_cluster.cluster_head); 737218792Snp bp->b_bcount = 0; 738228561Snp bp->b_bufsize = 0; 739228561Snp bp->b_npages = 0; 740228561Snp if (tbp->b_wcred != NOCRED) { 741228561Snp bp->b_wcred = tbp->b_wcred; 742228561Snp crhold(bp->b_wcred); 743228561Snp } 744228561Snp 745218792Snp bp->b_blkno = tbp->b_blkno; 746218792Snp bp->b_lblkno = tbp->b_lblkno; 747218792Snp bp->b_offset = tbp->b_offset; 748228561Snp bp->b_data = (char *)((vm_offset_t)bp->b_data | 749237263Snp ((vm_offset_t)tbp->b_data & PAGE_MASK)); 750228561Snp bp->b_flags |= B_CLUSTER | 751228561Snp (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT)); 752228561Snp bp->b_iodone = cluster_callback; 753228561Snp pbgetvp(vp, bp); 754228561Snp /* 755228561Snp * From this location in the file, scan forward to see 756228561Snp * if there are buffers with adjacent data that need to 757228561Snp * be written as well. 758228561Snp */ 759228561Snp for (i = 0; i < len; ++i, ++start_lbn) { 760228561Snp if (i != 0) { /* If not the first buffer */ 761228561Snp s = splbio(); 762228561Snp /* 763228561Snp * If the adjacent data is not even in core it 764218792Snp * can't need to be written. 765218792Snp */ 766240453Snp if ((tbp = gbincore(vp, start_lbn)) == NULL) { 767240453Snp splx(s); 768240453Snp break; 769240453Snp } 770240453Snp 771240453Snp /* 772240453Snp * If it IS in core, but has different 773218792Snp * characteristics, don't cluster with it. 774218792Snp */ 775218792Snp if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK | 776218792Snp B_INVAL | B_DELWRI | B_NEEDCOMMIT)) 777218792Snp != (B_DELWRI | B_CLUSTEROK | 778218792Snp (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) || 779218792Snp tbp->b_wcred != bp->b_wcred || 780218792Snp BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) { 781228561Snp splx(s); 782228561Snp break; 783228561Snp } 784228561Snp 785228561Snp /* 786228561Snp * Check that the combined cluster 787218792Snp * would make sense with regard to pages 788218792Snp * and would not be too large 789218792Snp */ 790228561Snp if ((tbp->b_bcount != size) || 791228561Snp ((bp->b_blkno + (dbsize * i)) != 792228561Snp tbp->b_blkno) || 793228561Snp ((tbp->b_npages + bp->b_npages) > 794228561Snp (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) { 795228561Snp BUF_UNLOCK(tbp); 796228561Snp splx(s); 797218792Snp break; 798218792Snp } 799228561Snp /* 800228561Snp * Ok, it's passed all the tests, 801218792Snp * so remove it from the free list 802218792Snp * and mark it busy. We will use it. 803218792Snp */ 804218792Snp bremfree(tbp); 805218792Snp tbp->b_flags &= ~B_DONE; 806218792Snp splx(s); 807218792Snp } /* end of code for non-first buffers only */ 808218792Snp /* check for latent dependencies to be handled */ 809218792Snp if ((LIST_FIRST(&tbp->b_dep)) != NULL) 810218792Snp buf_start(tbp); 811218792Snp /* 812218792Snp * If the IO is via the VM then we do some 813228561Snp * special VM hackery. (yuck) 814218792Snp */ 815218792Snp if (tbp->b_flags & B_VMIO) { 816218792Snp vm_page_t m; 817228561Snp 818228561Snp if (i != 0) { /* if not first buffer */ 819228561Snp for (j = 0; j < tbp->b_npages; j += 1) { 820228561Snp m = tbp->b_pages[j]; 821218792Snp if (m->flags & PG_BUSY) { 822228561Snp bqrelse(tbp); 823228561Snp goto finishcluster; 824218792Snp } 825228561Snp } 826228561Snp } 827228561Snp 828228561Snp for (j = 0; j < tbp->b_npages; j += 1) { 829228561Snp m = tbp->b_pages[j]; 830228561Snp vm_page_io_start(m); 831228561Snp vm_object_pip_add(m->object, 1); 832240453Snp if ((bp->b_npages == 0) || 833240453Snp (bp->b_pages[bp->b_npages - 1] != m)) { 834240453Snp bp->b_pages[bp->b_npages] = m; 835218792Snp bp->b_npages++; 836218792Snp } 837218792Snp } 838218792Snp } 839218792Snp bp->b_bcount += size; 840218792Snp bp->b_bufsize += size; 841218792Snp 842218792Snp s = splbio(); 843218792Snp bundirty(tbp); 844218792Snp tbp->b_flags &= ~B_DONE; 845218792Snp tbp->b_ioflags &= ~BIO_ERROR; 846218792Snp tbp->b_flags |= B_ASYNC; 847228561Snp tbp->b_iocmd = BIO_WRITE; 848228561Snp reassignbuf(tbp, tbp->b_vp); /* put on clean list */ 849228561Snp ++tbp->b_vp->v_numoutput; 850218792Snp splx(s); 851218792Snp BUF_KERNPROC(tbp); 852218792Snp TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, 853219944Snp tbp, b_cluster.cluster_entry); 854218792Snp } 855218792Snp finishcluster: 856218792Snp pmap_qenter(trunc_page((vm_offset_t) bp->b_data), 857218792Snp (vm_page_t *) bp->b_pages, bp->b_npages); 858218792Snp if (bp->b_bufsize > bp->b_kvasize) 859218792Snp panic( 860248925Snp "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", 861248925Snp bp->b_bufsize, bp->b_kvasize); 862248925Snp bp->b_kvasize = bp->b_bufsize; 863248925Snp totalwritten += bp->b_bufsize; 864218792Snp bp->b_dirtyoff = 0; 865218792Snp bp->b_dirtyend = bp->b_bufsize; 866218792Snp bawrite(bp); 867218792Snp 868222509Snp len -= i; 869222509Snp } 870222509Snp return totalwritten; 871237263Snp} 872228561Snp 873228561Snp/* 874228561Snp * Collect together all the buffers in a cluster. 875218792Snp * Plus add one additional buffer. 876218792Snp */ 877218792Snpstatic struct cluster_save * 878220873Snpcluster_collectbufs(vp, last_bp) 879218792Snp struct vnode *vp; 880218792Snp struct buf *last_bp; 881221474Snp{ 882218792Snp struct cluster_save *buflist; 883228561Snp struct buf *bp; 884228561Snp daddr_t lbn; 885228561Snp int i, len; 886228561Snp 887228561Snp len = vp->v_lastw - vp->v_cstart + 1; 888228561Snp buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), 889218792Snp M_SEGMENT, M_WAITOK); 890245274Snp buflist->bs_nchildren = 0; 891245274Snp buflist->bs_children = (struct buf **) (buflist + 1); 892228561Snp for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { 893228561Snp (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp); 894228561Snp buflist->bs_children[i] = bp; 895218792Snp if (bp->b_blkno == bp->b_lblkno) 896218792Snp VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, 897218792Snp NULL, NULL); 898218792Snp } 899218792Snp buflist->bs_children[i] = bp = last_bp; 900218792Snp if (bp->b_blkno == bp->b_lblkno) 901218792Snp VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, 902218792Snp NULL, NULL); 903218792Snp buflist->bs_nchildren = i + 1; 904218792Snp return (buflist); 905218792Snp} 906218792Snp