vfs_cluster.c revision 41529
/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 * $Id: vfs_cluster.c,v 1.74 1998/11/17 00:31:12 mckusick Exp $
 */

#include "opt_debug_cluster.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_prot.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

#if defined(CLUSTERDEBUG)
#include <sys/sysctl.h>
static int rcluster = 0;
SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
#endif

static MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer");

static struct cluster_save *
	cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp));
static struct buf *
	cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn,
	    daddr_t blkno, long size, int run, struct buf *fbp));

extern vm_page_t bogus_page;

/*
 * Maximum number of blocks for read-ahead.
 */
#define MAXRA 32
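
/*
 * Read-ahead bookkeeping (see cluster_read() and cluster_rbuild() below):
 * buffers instantiated by read-ahead carry the B_RAM mark.  Hitting a
 * marked buffer in the cache is taken as evidence that the reader is
 * still sequential, so another round of read-ahead is triggered; a cache
 * hit without the mark causes read-ahead to back off.
 */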

/*
 * This replaces bread.
 */
int
cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	long totread;
	int seqcount;
	struct buf **bpp;
{
	struct buf *bp, *rbp, *reqbp;
	daddr_t blkno, origblkno;
	int error, num_ra;
	int i;
	int maxra, racluster;
	long origtotread;

	error = 0;
	if (vp->v_maxio == 0)
		vp->v_maxio = DFLTPHYS;

	/*
	 * Try to limit the amount of read-ahead by a few
	 * ad-hoc parameters.  This needs work!!!
	 */
	racluster = vp->v_maxio / size;
	maxra = 2 * racluster + (totread / size);
	if (maxra > MAXRA)
		maxra = MAXRA;
	if (maxra > nbuf / 8)
		maxra = nbuf / 8;

	/*
	 * get the requested block
	 */
	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0);
	origblkno = lblkno;
	origtotread = totread;

	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		if (!seqcount) {
			return 0;
		} else if ((bp->b_flags & B_RAM) == 0) {
			return 0;
		} else {
			int s;
			struct buf *tbp;
			bp->b_flags &= ~B_RAM;
			/*
			 * We do the spl here so that there is no window
			 * between the incore and the b_usecount increment
			 * below.  We opt to keep the spl out of the loop
			 * for efficiency.
			 */
			s = splbio();
			for (i = 1; i < maxra; i++) {

				if (!(tbp = incore(vp, lblkno + i))) {
					break;
				}

				/*
				 * Set another read-ahead mark so we know
				 * to check again.
				 */
				if (((i % racluster) == (racluster - 1)) ||
				    (i == (maxra - 1)))
					tbp->b_flags |= B_RAM;

				if ((tbp->b_usecount < 1) &&
				    ((tbp->b_flags & B_BUSY) == 0) &&
				    (tbp->b_qindex == QUEUE_LRU)) {
					TAILQ_REMOVE(&bufqueues[QUEUE_LRU], tbp, b_freelist);
					TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], tbp, b_freelist);
				}
			}
			splx(s);
			if (i >= maxra) {
				return 0;
			}
			lblkno += i;
		}
		reqbp = bp = NULL;
	} else {
		off_t firstread;
		firstread = bp->b_offset;
#ifdef DIAGNOSTIC
		if (bp->b_offset == NOOFFSET)
			panic("cluster_read: no buffer offset");
#endif
		if (firstread + totread > filesize)
			totread = filesize - firstread;
		if (totread > size) {
			int nblks = 0;
			int ncontigafter;
			while (totread > 0) {
				nblks++;
				totread -= size;
			}
			if (nblks == 1)
				goto single_block_read;
			if (nblks > racluster)
				nblks = racluster;

			error = VOP_BMAP(vp, lblkno, NULL,
			    &blkno, &ncontigafter, NULL);
			if (error)
				goto single_block_read;
			if (blkno == -1)
				goto single_block_read;
			if (ncontigafter == 0)
				goto single_block_read;
			if (ncontigafter + 1 < nblks)
				nblks = ncontigafter + 1;

			bp = cluster_rbuild(vp, filesize, lblkno,
			    blkno, size, nblks, bp);
			lblkno += (bp->b_bufsize / size);
		} else {
single_block_read:
			/*
			 * if it isn't in the cache, then get a chunk from
			 * disk if sequential, otherwise just get the block.
			 */
			bp->b_flags |= B_READ | B_RAM;
			lblkno += 1;
		}
	}
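
	/*
	 * Whichever path was taken above, lblkno has been advanced past
	 * every block that is already cached or already part of the
	 * synchronous read, so it now names the first candidate block for
	 * the asynchronous read-ahead below.
	 */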

	/*
	 * if we have been doing sequential I/O, then do some read-ahead
	 */
	rbp = NULL;
	if (seqcount && (lblkno < (origblkno + seqcount))) {
		/*
		 * we now build the read-ahead buffer if it is desirable.
		 */
		if (((u_quad_t)(lblkno + 1) * size) <= filesize &&
		    !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) &&
		    blkno != -1) {
			int nblksread;
			int ntoread = num_ra + 1;
			nblksread = (origtotread + size - 1) / size;
			if (seqcount < nblksread)
				seqcount = nblksread;
			if (seqcount < ntoread)
				ntoread = seqcount;
			if (num_ra) {
				rbp = cluster_rbuild(vp, filesize, lblkno,
				    blkno, size, ntoread, NULL);
			} else {
				rbp = getblk(vp, lblkno, size, 0, 0);
				rbp->b_flags |= B_READ | B_ASYNC | B_RAM;
				rbp->b_blkno = blkno;
			}
		}
	}

	/*
	 * handle the synchronous read
	 */
	if (bp) {
#if defined(CLUSTERDEBUG)
		if (rcluster)
			printf("S(%ld,%ld,%d) ",
			    (long)bp->b_lblkno, bp->b_bcount, seqcount);
#endif
		if ((bp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(bp, 0);
		error = VOP_STRATEGY(vp, bp);
		curproc->p_stats->p_ru.ru_inblock++;
	}

	/*
	 * and if we have read-aheads, do them too
	 */
	if (rbp) {
		if (error) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else if (rbp->b_flags & B_CACHE) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			bqrelse(rbp);
		} else {
#if defined(CLUSTERDEBUG)
			if (rcluster) {
				if (bp)
					printf("A+(%ld,%ld,%ld,%d) ",
					    (long)rbp->b_lblkno, rbp->b_bcount,
					    (long)(rbp->b_lblkno - origblkno),
					    seqcount);
				else
					printf("A(%ld,%ld,%ld,%d) ",
					    (long)rbp->b_lblkno, rbp->b_bcount,
					    (long)(rbp->b_lblkno - origblkno),
					    seqcount);
			}
#endif

			if ((rbp->b_flags & B_CLUSTER) == 0)
				vfs_busy_pages(rbp, 0);
			(void) VOP_STRATEGY(vp, rbp);
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	if (reqbp)
		return (biowait(reqbp));
	else
		return (error);
}
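
/*
 * A sketch of how a filesystem read path might hand a transfer to
 * cluster_read() instead of bread(); ip, lbn, blkoffset, xfersize and
 * seqcount here belong to the hypothetical caller:
 *
 *	error = cluster_read(vp, ip->i_size, lbn, size, NOCRED,
 *	    uio->uio_resid, seqcount, &bp);
 *	if (error == 0)
 *		error = uiomove((char *)bp->b_data + blkoffset,
 *		    (int)xfersize, uio);
 */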

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
static struct buf *
cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	struct buf *fbp;
{
	struct buf *bp, *tbp;
	daddr_t bn;
	int i, inc, j;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %ld != f_iosize %ld\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
	/*
	 * avoid a division
	 */
	while ((u_quad_t) size * (lbn + run) > filesize) {
		--run;
	}

	if (fbp) {
		tbp = fbp;
		tbp->b_flags |= B_READ;
	} else {
		tbp = getblk(vp, lbn, size, 0, 0);
		if (tbp->b_flags & B_CACHE)
			return tbp;
		tbp->b_flags |= B_ASYNC | B_READ | B_RAM;
	}

	tbp->b_blkno = blkno;
	if ((tbp->b_flags & B_MALLOC) ||
	    ((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
		return tbp;

	bp = trypbuf();
	if (bp == NULL)
		return tbp;

	bp->b_data = (char *)((vm_offset_t)bp->b_data |
	    ((vm_offset_t)tbp->b_data & PAGE_MASK));
	bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
	bp->b_iodone = cluster_callback;
	bp->b_blkno = blkno;
	bp->b_lblkno = lbn;
	bp->b_offset = tbp->b_offset;
#ifdef DIAGNOSTIC
	if (bp->b_offset == NOOFFSET)
		panic("cluster_rbuild: no buffer offset");
#endif
	pbgetvp(vp, bp);

	TAILQ_INIT(&bp->b_cluster.cluster_head);

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	if (vp->v_maxio == 0)
		vp->v_maxio = DFLTPHYS;
	inc = btodb(size);
	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
		if (i != 0) {
			if ((bp->b_npages * PAGE_SIZE) +
			    round_page(size) > vp->v_maxio)
				break;

			if ((tbp = incore(vp, lbn + i)) != NULL) {
				if (tbp->b_flags & B_BUSY)
					break;

				for (j = 0; j < tbp->b_npages; j++)
					if (tbp->b_pages[j]->valid)
						break;

				if (j != tbp->b_npages)
					break;

				if (tbp->b_bcount != size)
					break;
			}

			tbp = getblk(vp, lbn + i, size, 0, 0);

			if ((tbp->b_flags & B_CACHE) ||
			    (tbp->b_flags & B_VMIO) == 0) {
				bqrelse(tbp);
				break;
			}

			for (j = 0; j < tbp->b_npages; j++)
				if (tbp->b_pages[j]->valid)
					break;

			if (j != tbp->b_npages) {
				bqrelse(tbp);
				break;
			}

			if ((fbp && (i == 1)) || (i == (run - 1)))
				tbp->b_flags |= B_RAM;
			tbp->b_flags |= B_READ | B_ASYNC;
			if (tbp->b_blkno == tbp->b_lblkno) {
				tbp->b_blkno = bn;
			} else if (tbp->b_blkno != bn) {
				brelse(tbp);
				break;
			}
		}
		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
		    tbp, b_cluster.cluster_entry);
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;
			m = tbp->b_pages[j];
			vm_page_io_start(m);
			vm_object_pip_add(m->object, 1);
			if ((bp->b_npages == 0) ||
			    (bp->b_pages[bp->b_npages - 1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
				tbp->b_pages[j] = bogus_page;
		}
		bp->b_bcount += tbp->b_bcount;
		bp->b_bufsize += tbp->b_bufsize;
	}
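
	/*
	 * Pages that are already fully valid are replaced in the cluster's
	 * page array by the dummy bogus_page: the device still transfers a
	 * contiguous run, but the bytes for those pages land in bogus_page
	 * and are discarded instead of overwriting good cached data.
	 */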
	for (j = 0; j < bp->b_npages; j++) {
		if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) ==
		    VM_PAGE_BITS_ALL)
			bp->b_pages[j] = bogus_page;
	}
	if (bp->b_bufsize > bp->b_kvasize)
		panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
		    bp->b_bufsize, bp->b_kvasize);
	bp->b_kvasize = bp->b_bufsize;

	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
	    (vm_page_t *)bp->b_pages, bp->b_npages);
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct buf *nbp, *tbp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
	     tbp; tbp = nbp) {
		nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		} else
			tbp->b_dirtyoff = tbp->b_dirtyend = 0;
		biodone(tbp);
	}
	relpbuf(bp);
}
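
/*
 * The write-clustering state is kept in the vnode: v_cstart is the first
 * logical block of the cluster being collected, v_lastw the last logical
 * block written, v_lasta the disk address of that last write, and v_clen
 * the number of further blocks that may still join (the cluster is pushed
 * when lbn reaches v_cstart + v_clen).  cluster_write() updates these on
 * every delayed write.
 */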

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1.	Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;
	int lblocksize;
	int async;

	vp = bp->b_vp;
	if (vp->v_maxio == 0)
		vp->v_maxio = DFLTPHYS;
	if (vp->v_type == VREG) {
		async = vp->v_mount->mnt_flag & MNT_ASYNC;
		lblocksize = vp->v_mount->mnt_stat.f_iosize;
	} else {
		async = 0;
		lblocksize = bp->b_bufsize;
	}
	lbn = bp->b_lblkno;

#ifdef DIAGNOSTIC
	if (bp->b_offset == NOOFFSET)
		panic("cluster_write: no buffer offset");
#endif

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
		maxclen = vp->v_maxio / lblocksize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster.  Otherwise try
			 * reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
			if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				if (!async)
					cluster_wbuild(vp, lblocksize,
					    vp->v_cstart, cursize);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, lblocksize,
					    vp->v_cstart, cursize);
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster.  If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if ((vp->v_type == VREG) &&
		    ((u_quad_t) bp->b_offset + lblocksize) != filesize &&
		    (bp->b_blkno == bp->b_lblkno) &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
		     bp->b_blkno == -1)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (!async && maxclen == 0) {	/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {	/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		bdwrite(bp);
		cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}
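
/*
 * Worked example of the cases above, assuming 8k blocks and the default
 * v_maxio of DFLTPHYS (64k), so maxclen starts out as 7: a sequential
 * append delays block 0 (case 2, v_cstart = 0, v_clen = 7), delays blocks
 * 1 through 6 (case 3), and on block 7 (lbn == v_cstart + v_clen) pushes
 * all eight blocks as one 64k transfer via cluster_wbuild() (case 4).  A
 * non-sequential write meanwhile flushes the partial cluster and starts
 * over (case 1).
 */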

/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * Scan from start_lbn for len blocks' worth of delayed writes and push
 * them to disk in as few clustered transfers as possible.
 */
int
cluster_wbuild(vp, size, start_lbn, len)
	struct vnode *vp;
	long size;
	daddr_t start_lbn;
	int len;
{
	struct buf *bp, *tbp;
	int i, j, s;
	int totalwritten = 0;
	int dbsize = btodb(size);

	if (vp->v_maxio == 0)
		vp->v_maxio = DFLTPHYS;
	while (len > 0) {
		s = splbio();
		if (((tbp = gbincore(vp, start_lbn)) == NULL) ||
		    ((tbp->b_flags & (B_INVAL | B_BUSY | B_DELWRI)) != B_DELWRI)) {
			++start_lbn;
			--len;
			splx(s);
			continue;
		}
		bremfree(tbp);
		tbp->b_flags |= B_BUSY;
		tbp->b_flags &= ~B_DONE;
		splx(s);

		/*
		 * Extra memory in the buffer, punt on this buffer.  XXX we
		 * could handle this in most cases, but we would have to push
		 * the extra memory down to after our max possible cluster
		 * size and then potentially pull it back up if the cluster
		 * was terminated prematurely--too much hassle.
		 */
		if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC)) != B_CLUSTEROK) ||
		    (tbp->b_bcount != tbp->b_bufsize) ||
		    (tbp->b_bcount != size) ||
		    (len == 1) ||
		    ((bp = trypbuf()) == NULL)) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			++start_lbn;
			--len;
			continue;
		}

		/*
		 * We got a pbuf to make the cluster in,
		 * so initialise it.
		 */
		TAILQ_INIT(&bp->b_cluster.cluster_head);
		bp->b_bcount = 0;
		bp->b_bufsize = 0;
		bp->b_npages = 0;
		if (tbp->b_wcred != NOCRED) {
			bp->b_wcred = tbp->b_wcred;
			crhold(bp->b_wcred);
		}

		bp->b_blkno = tbp->b_blkno;
		bp->b_lblkno = tbp->b_lblkno;
		bp->b_offset = tbp->b_offset;
		bp->b_data = (char *)((vm_offset_t)bp->b_data |
		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
		bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER |
		    (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
		bp->b_iodone = cluster_callback;
		pbgetvp(vp, bp);
		/*
		 * From this location in the file, scan forward to see
		 * if there are buffers with adjacent data that need to
		 * be written as well.
		 */
		for (i = 0; i < len; ++i, ++start_lbn) {
			if (i != 0) { /* If not the first buffer */
				s = splbio();
				/*
				 * If the adjacent data is not even in core it
				 * can't need to be written.
				 */
				if ((tbp = gbincore(vp, start_lbn)) == NULL) {
					splx(s);
					break;
				}

				/*
				 * If it IS in core, but has different
				 * characteristics, don't cluster with it.
				 */
				if ((tbp->b_flags &
				    (B_VMIO | B_CLUSTEROK | B_INVAL | B_BUSY |
				     B_DELWRI | B_NEEDCOMMIT))
				    != (B_DELWRI | B_CLUSTEROK |
				     (bp->b_flags & (B_VMIO | B_NEEDCOMMIT)))) {
					splx(s);
					break;
				}

				if (tbp->b_wcred != bp->b_wcred) {
					splx(s);
					break;
				}

				/*
				 * Check that the combined cluster
				 * would make sense with regard to pages
				 * and would not be too large
				 */
				if ((tbp->b_bcount != size) ||
				    ((bp->b_blkno + (dbsize * i)) !=
				     tbp->b_blkno) ||
				    ((tbp->b_npages + bp->b_npages) >
				     (vp->v_maxio / PAGE_SIZE))) {
					splx(s);
					break;
				}
				/*
				 * Ok, it's passed all the tests,
				 * so remove it from the free list
				 * and mark it busy.  We will use it.
				 */
				bremfree(tbp);
				tbp->b_flags |= B_BUSY;
				tbp->b_flags &= ~B_DONE;
				splx(s);
			} /* end of code for non-first buffers only */
			/* check for latent dependencies to be handled */
			if ((LIST_FIRST(&tbp->b_dep)) != NULL &&
			    bioops.io_start)
				(*bioops.io_start)(tbp);
			/*
			 * If the IO is via the VM then we do some
			 * special VM hackery (yuck).
			 */
			if (tbp->b_flags & B_VMIO) {
				vm_page_t m;

				if (i != 0) { /* if not first buffer */
					for (j = 0; j < tbp->b_npages; j += 1) {
						m = tbp->b_pages[j];
						if (m->flags & PG_BUSY)
							goto finishcluster;
					}
				}

				for (j = 0; j < tbp->b_npages; j += 1) {
					m = tbp->b_pages[j];
					vm_page_io_start(m);
					vm_object_pip_add(m->object, 1);
					if ((bp->b_npages == 0) ||
					    (bp->b_pages[bp->b_npages - 1] != m)) {
						bp->b_pages[bp->b_npages] = m;
						bp->b_npages++;
					}
				}
			}
			bp->b_bcount += size;
			bp->b_bufsize += size;

			s = splbio();
			--numdirtybuffers;
			tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
			tbp->b_flags |= B_ASYNC;
			reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
			++tbp->b_vp->v_numoutput;
			splx(s);
			TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
			    tbp, b_cluster.cluster_entry);
		}
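		/*
		 * Either the scan above ran out of combinable buffers or a
		 * busy page forced an early goto: map every page collected
		 * into the pbuf's KVA and push the cluster as one
		 * asynchronous write.
		 */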
	finishcluster:
		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
		    (vm_page_t *) bp->b_pages, bp->b_npages);
		if (bp->b_bufsize > bp->b_kvasize)
			panic(
			    "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
			    bp->b_bufsize, bp->b_kvasize);
		bp->b_kvasize = bp->b_bufsize;
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
		bawrite(bp);

		len -= i;
	}
	return totalwritten;
}

/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
static struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *buflist;
	struct buf *bp;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp);
		buflist->bs_children[i] = bp;
		if (bp->b_blkno == bp->b_lblkno)
			VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
			    NULL, NULL);
	}
	buflist->bs_children[i] = bp = last_bp;
	if (bp->b_blkno == bp->b_lblkno)
		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
		    NULL, NULL);
	buflist->bs_nchildren = i + 1;
	return (buflist);
}