vfs_cluster.c revision 47948
/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
34 * 35 * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 36 * $Id: vfs_cluster.c,v 1.81 1999/05/02 23:56:11 alc Exp $ 37 */ 38 39#include "opt_debug_cluster.h" 40 41#include <sys/param.h> 42#include <sys/systm.h> 43#include <sys/kernel.h> 44#include <sys/proc.h> 45#include <sys/buf.h> 46#include <sys/vnode.h> 47#include <sys/malloc.h> 48#include <sys/mount.h> 49#include <sys/resourcevar.h> 50#include <vm/vm.h> 51#include <vm/vm_prot.h> 52#include <vm/vm_object.h> 53#include <vm/vm_page.h> 54 55#if defined(CLUSTERDEBUG) 56#include <sys/sysctl.h> 57static int rcluster= 0; 58SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, ""); 59#endif 60 61static MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer"); 62 63static struct cluster_save * 64 cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp)); 65static struct buf * 66 cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn, 67 daddr_t blkno, long size, int run, struct buf *fbp)); 68 69extern vm_page_t bogus_page; 70 71extern int cluster_pbuf_freecnt; 72 73/* 74 * Maximum number of blocks for read-ahead. 75 */ 76#define MAXRA 32 77 78/* 79 * This replaces bread. 80 */ 81int 82cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) 83 struct vnode *vp; 84 u_quad_t filesize; 85 daddr_t lblkno; 86 long size; 87 struct ucred *cred; 88 long totread; 89 int seqcount; 90 struct buf **bpp; 91{ 92 struct buf *bp, *rbp, *reqbp; 93 daddr_t blkno, origblkno; 94 int error, num_ra; 95 int i; 96 int maxra, racluster; 97 long origtotread; 98 99 error = 0; 100 if (vp->v_maxio == 0) 101 vp->v_maxio = DFLTPHYS; 102 103 /* 104 * Try to limit the amount of read-ahead by a few 105 * ad-hoc parameters. This needs work!!! 
106 */ 107 racluster = vp->v_maxio/size; 108 maxra = 2 * racluster + (totread / size); 109 if (maxra > MAXRA) 110 maxra = MAXRA; 111 if (maxra > nbuf/8) 112 maxra = nbuf/8; 113 114 /* 115 * get the requested block 116 */ 117 *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0); 118 origblkno = lblkno; 119 origtotread = totread; 120 121 /* 122 * if it is in the cache, then check to see if the reads have been 123 * sequential. If they have, then try some read-ahead, otherwise 124 * back-off on prospective read-aheads. 125 */ 126 if (bp->b_flags & B_CACHE) { 127 if (!seqcount) { 128 return 0; 129 } else if ((bp->b_flags & B_RAM) == 0) { 130 return 0; 131 } else { 132 int s; 133 struct buf *tbp; 134 bp->b_flags &= ~B_RAM; 135 /* 136 * We do the spl here so that there is no window 137 * between the incore and the b_usecount increment 138 * below. We opt to keep the spl out of the loop 139 * for efficiency. 140 */ 141 s = splbio(); 142 for(i=1;i<maxra;i++) { 143 144 if (!(tbp = incore(vp, lblkno+i))) { 145 break; 146 } 147 148 /* 149 * Set another read-ahead mark so we know to check 150 * again. 
151 */ 152 if (((i % racluster) == (racluster - 1)) || 153 (i == (maxra - 1))) 154 tbp->b_flags |= B_RAM; 155 156 if ((tbp->b_usecount < 1) && 157 ((tbp->b_flags & B_BUSY) == 0) && 158 (tbp->b_qindex == QUEUE_LRU)) { 159 TAILQ_REMOVE(&bufqueues[QUEUE_LRU], tbp, b_freelist); 160 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], tbp, b_freelist); 161 } 162 } 163 splx(s); 164 if (i >= maxra) { 165 return 0; 166 } 167 lblkno += i; 168 } 169 reqbp = bp = NULL; 170 } else { 171 off_t firstread = bp->b_offset; 172 173 KASSERT(bp->b_offset != NOOFFSET, 174 ("cluster_read: no buffer offset")); 175 if (firstread + totread > filesize) 176 totread = filesize - firstread; 177 if (totread > size) { 178 int nblks = 0; 179 int ncontigafter; 180 while (totread > 0) { 181 nblks++; 182 totread -= size; 183 } 184 if (nblks == 1) 185 goto single_block_read; 186 if (nblks > racluster) 187 nblks = racluster; 188 189 error = VOP_BMAP(vp, lblkno, NULL, 190 &blkno, &ncontigafter, NULL); 191 if (error) 192 goto single_block_read; 193 if (blkno == -1) 194 goto single_block_read; 195 if (ncontigafter == 0) 196 goto single_block_read; 197 if (ncontigafter + 1 < nblks) 198 nblks = ncontigafter + 1; 199 200 bp = cluster_rbuild(vp, filesize, lblkno, 201 blkno, size, nblks, bp); 202 lblkno += (bp->b_bufsize / size); 203 } else { 204single_block_read: 205 /* 206 * if it isn't in the cache, then get a chunk from 207 * disk if sequential, otherwise just get the block. 208 */ 209 bp->b_flags |= B_READ | B_RAM; 210 lblkno += 1; 211 } 212 } 213 214 /* 215 * if we have been doing sequential I/O, then do some read-ahead 216 */ 217 rbp = NULL; 218 if (seqcount && (lblkno < (origblkno + seqcount))) { 219 /* 220 * we now build the read-ahead buffer if it is desirable. 
221 */ 222 if (((u_quad_t)(lblkno + 1) * size) <= filesize && 223 !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) && 224 blkno != -1) { 225 int nblksread; 226 int ntoread = num_ra + 1; 227 nblksread = (origtotread + size - 1) / size; 228 if (seqcount < nblksread) 229 seqcount = nblksread; 230 if (seqcount < ntoread) 231 ntoread = seqcount; 232 if (num_ra) { 233 rbp = cluster_rbuild(vp, filesize, lblkno, 234 blkno, size, ntoread, NULL); 235 } else { 236 rbp = getblk(vp, lblkno, size, 0, 0); 237 rbp->b_flags |= B_READ | B_ASYNC | B_RAM; 238 rbp->b_blkno = blkno; 239 } 240 } 241 } 242 243 /* 244 * handle the synchronous read 245 */ 246 if (bp) { 247#if defined(CLUSTERDEBUG) 248 if (rcluster) 249 printf("S(%ld,%ld,%d) ", 250 (long)bp->b_lblkno, bp->b_bcount, seqcount); 251#endif 252 if ((bp->b_flags & B_CLUSTER) == 0) 253 vfs_busy_pages(bp, 0); 254 bp->b_flags &= ~(B_ERROR|B_INVAL); 255 error = VOP_STRATEGY(vp, bp); 256 curproc->p_stats->p_ru.ru_inblock++; 257 } 258 259 /* 260 * and if we have read-aheads, do them too 261 */ 262 if (rbp) { 263 if (error) { 264 rbp->b_flags &= ~(B_ASYNC | B_READ); 265 brelse(rbp); 266 } else if (rbp->b_flags & B_CACHE) { 267 rbp->b_flags &= ~(B_ASYNC | B_READ); 268 bqrelse(rbp); 269 } else { 270#if defined(CLUSTERDEBUG) 271 if (rcluster) { 272 if (bp) 273 printf("A+(%ld,%ld,%ld,%d) ", 274 (long)rbp->b_lblkno, rbp->b_bcount, 275 (long)(rbp->b_lblkno - origblkno), 276 seqcount); 277 else 278 printf("A(%ld,%ld,%ld,%d) ", 279 (long)rbp->b_lblkno, rbp->b_bcount, 280 (long)(rbp->b_lblkno - origblkno), 281 seqcount); 282 } 283#endif 284 285 if ((rbp->b_flags & B_CLUSTER) == 0) 286 vfs_busy_pages(rbp, 0); 287 rbp->b_flags &= ~(B_ERROR|B_INVAL); 288 (void) VOP_STRATEGY(vp, rbp); 289 curproc->p_stats->p_ru.ru_inblock++; 290 } 291 } 292 if (reqbp) 293 return (biowait(reqbp)); 294 else 295 return (error); 296} 297 298/* 299 * If blocks are contiguous on disk, use this to provide clustered 300 * read ahead. 
We will read as many blocks as possible sequentially 301 * and then parcel them up into logical blocks in the buffer hash table. 302 */ 303static struct buf * 304cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) 305 struct vnode *vp; 306 u_quad_t filesize; 307 daddr_t lbn; 308 daddr_t blkno; 309 long size; 310 int run; 311 struct buf *fbp; 312{ 313 struct buf *bp, *tbp; 314 daddr_t bn; 315 int i, inc, j; 316 317 KASSERT(size == vp->v_mount->mnt_stat.f_iosize, 318 ("cluster_rbuild: size %ld != filesize %ld\n", 319 size, vp->v_mount->mnt_stat.f_iosize)); 320 321 /* 322 * avoid a division 323 */ 324 while ((u_quad_t) size * (lbn + run) > filesize) { 325 --run; 326 } 327 328 if (fbp) { 329 tbp = fbp; 330 tbp->b_flags |= B_READ; 331 } else { 332 tbp = getblk(vp, lbn, size, 0, 0); 333 if (tbp->b_flags & B_CACHE) 334 return tbp; 335 tbp->b_flags |= B_ASYNC | B_READ | B_RAM; 336 } 337 338 tbp->b_blkno = blkno; 339 if( (tbp->b_flags & B_MALLOC) || 340 ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) 341 return tbp; 342 343 bp = trypbuf(&cluster_pbuf_freecnt); 344 if (bp == 0) 345 return tbp; 346 347 bp->b_data = (char *)((vm_offset_t)bp->b_data | 348 ((vm_offset_t)tbp->b_data & PAGE_MASK)); 349 bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO; 350 bp->b_iodone = cluster_callback; 351 bp->b_blkno = blkno; 352 bp->b_lblkno = lbn; 353 bp->b_offset = tbp->b_offset; 354 KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset")); 355 pbgetvp(vp, bp); 356 357 TAILQ_INIT(&bp->b_cluster.cluster_head); 358 359 bp->b_bcount = 0; 360 bp->b_bufsize = 0; 361 bp->b_npages = 0; 362 363 if (vp->v_maxio == 0) 364 vp->v_maxio = DFLTPHYS; 365 inc = btodb(size); 366 for (bn = blkno, i = 0; i < run; ++i, bn += inc) { 367 if (i != 0) { 368 if ((bp->b_npages * PAGE_SIZE) + 369 round_page(size) > vp->v_maxio) 370 break; 371 372 if ((tbp = incore(vp, lbn + i)) != NULL) { 373 if (tbp->b_flags & B_BUSY) 374 break; 375 376 for (j = 0; j < tbp->b_npages; j++) 377 
if (tbp->b_pages[j]->valid) 378 break; 379 380 if (j != tbp->b_npages) 381 break; 382 383 if (tbp->b_bcount != size) 384 break; 385 } 386 387 tbp = getblk(vp, lbn + i, size, 0, 0); 388 389 if ((tbp->b_flags & B_CACHE) || 390 (tbp->b_flags & B_VMIO) == 0) { 391 bqrelse(tbp); 392 break; 393 } 394 395 for (j = 0;j < tbp->b_npages; j++) 396 if (tbp->b_pages[j]->valid) 397 break; 398 399 if (j != tbp->b_npages) { 400 bqrelse(tbp); 401 break; 402 } 403 404 if ((fbp && (i == 1)) || (i == (run - 1))) 405 tbp->b_flags |= B_RAM; 406 tbp->b_flags |= B_READ | B_ASYNC; 407 if (tbp->b_blkno == tbp->b_lblkno) { 408 tbp->b_blkno = bn; 409 } else if (tbp->b_blkno != bn) { 410 brelse(tbp); 411 break; 412 } 413 } 414 TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, 415 tbp, b_cluster.cluster_entry); 416 for (j = 0; j < tbp->b_npages; j += 1) { 417 vm_page_t m; 418 m = tbp->b_pages[j]; 419 vm_page_io_start(m); 420 vm_object_pip_add(m->object, 1); 421 if ((bp->b_npages == 0) || 422 (bp->b_pages[bp->b_npages-1] != m)) { 423 bp->b_pages[bp->b_npages] = m; 424 bp->b_npages++; 425 } 426 if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) 427 tbp->b_pages[j] = bogus_page; 428 } 429 bp->b_bcount += tbp->b_bcount; 430 bp->b_bufsize += tbp->b_bufsize; 431 } 432 433 for(j=0;j<bp->b_npages;j++) { 434 if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) == 435 VM_PAGE_BITS_ALL) 436 bp->b_pages[j] = bogus_page; 437 } 438 if (bp->b_bufsize > bp->b_kvasize) 439 panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", 440 bp->b_bufsize, bp->b_kvasize); 441 bp->b_kvasize = bp->b_bufsize; 442 443 pmap_qenter(trunc_page((vm_offset_t) bp->b_data), 444 (vm_page_t *)bp->b_pages, bp->b_npages); 445 return (bp); 446} 447 448/* 449 * Cleanup after a clustered read or write. 450 * This is complicated by the fact that any of the buffers might have 451 * extra memory (if there were no empty buffer headers at allocbuf time) 452 * that we will need to shift around. 
453 */ 454void 455cluster_callback(bp) 456 struct buf *bp; 457{ 458 struct buf *nbp, *tbp; 459 int error = 0; 460 461 /* 462 * Must propogate errors to all the components. 463 */ 464 if (bp->b_flags & B_ERROR) 465 error = bp->b_error; 466 467 pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); 468 /* 469 * Move memory from the large cluster buffer into the component 470 * buffers and mark IO as done on these. 471 */ 472 for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); 473 tbp; tbp = nbp) { 474 nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); 475 if (error) { 476 tbp->b_flags |= B_ERROR; 477 tbp->b_error = error; 478 } else { 479 tbp->b_dirtyoff = tbp->b_dirtyend = 0; 480 tbp->b_flags &= ~(B_ERROR|B_INVAL); 481 } 482 biodone(tbp); 483 } 484 relpbuf(bp, &cluster_pbuf_freecnt); 485} 486 487/* 488 * Do clustered write for FFS. 489 * 490 * Three cases: 491 * 1. Write is not sequential (write asynchronously) 492 * Write is sequential: 493 * 2. beginning of cluster - begin cluster 494 * 3. middle of a cluster - add to cluster 495 * 4. end of a cluster - asynchronously write cluster 496 */ 497void 498cluster_write(bp, filesize) 499 struct buf *bp; 500 u_quad_t filesize; 501{ 502 struct vnode *vp; 503 daddr_t lbn; 504 int maxclen, cursize; 505 int lblocksize; 506 int async; 507 508 vp = bp->b_vp; 509 if (vp->v_maxio == 0) 510 vp->v_maxio = DFLTPHYS; 511 if (vp->v_type == VREG) { 512 async = vp->v_mount->mnt_flag & MNT_ASYNC; 513 lblocksize = vp->v_mount->mnt_stat.f_iosize; 514 } else { 515 async = 0; 516 lblocksize = bp->b_bufsize; 517 } 518 lbn = bp->b_lblkno; 519 KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")); 520 521 /* Initialize vnode to beginning of file. 
*/ 522 if (lbn == 0) 523 vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; 524 525 if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || 526 (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { 527 maxclen = vp->v_maxio / lblocksize - 1; 528 if (vp->v_clen != 0) { 529 /* 530 * Next block is not sequential. 531 * 532 * If we are not writing at end of file, the process 533 * seeked to another point in the file since its last 534 * write, or we have reached our maximum cluster size, 535 * then push the previous cluster. Otherwise try 536 * reallocating to make it sequential. 537 */ 538 cursize = vp->v_lastw - vp->v_cstart + 1; 539 if (((u_quad_t) bp->b_offset + lblocksize) != filesize || 540 lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { 541 if (!async) 542 cluster_wbuild(vp, lblocksize, 543 vp->v_cstart, cursize); 544 } else { 545 struct buf **bpp, **endbp; 546 struct cluster_save *buflist; 547 548 buflist = cluster_collectbufs(vp, bp); 549 endbp = &buflist->bs_children 550 [buflist->bs_nchildren - 1]; 551 if (VOP_REALLOCBLKS(vp, buflist)) { 552 /* 553 * Failed, push the previous cluster. 554 */ 555 for (bpp = buflist->bs_children; 556 bpp < endbp; bpp++) 557 brelse(*bpp); 558 free(buflist, M_SEGMENT); 559 cluster_wbuild(vp, lblocksize, 560 vp->v_cstart, cursize); 561 } else { 562 /* 563 * Succeeded, keep building cluster. 564 */ 565 for (bpp = buflist->bs_children; 566 bpp <= endbp; bpp++) 567 bdwrite(*bpp); 568 free(buflist, M_SEGMENT); 569 vp->v_lastw = lbn; 570 vp->v_lasta = bp->b_blkno; 571 return; 572 } 573 } 574 } 575 /* 576 * Consider beginning a cluster. If at end of file, make 577 * cluster as large as possible, otherwise find size of 578 * existing cluster. 
579 */ 580 if ((vp->v_type == VREG) && 581 ((u_quad_t) bp->b_offset + lblocksize) != filesize && 582 (bp->b_blkno == bp->b_lblkno) && 583 (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || 584 bp->b_blkno == -1)) { 585 bawrite(bp); 586 vp->v_clen = 0; 587 vp->v_lasta = bp->b_blkno; 588 vp->v_cstart = lbn + 1; 589 vp->v_lastw = lbn; 590 return; 591 } 592 vp->v_clen = maxclen; 593 if (!async && maxclen == 0) { /* I/O not contiguous */ 594 vp->v_cstart = lbn + 1; 595 bawrite(bp); 596 } else { /* Wait for rest of cluster */ 597 vp->v_cstart = lbn; 598 bdwrite(bp); 599 } 600 } else if (lbn == vp->v_cstart + vp->v_clen) { 601 /* 602 * At end of cluster, write it out. 603 */ 604 bdwrite(bp); 605 cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); 606 vp->v_clen = 0; 607 vp->v_cstart = lbn + 1; 608 } else 609 /* 610 * In the middle of a cluster, so just delay the I/O for now. 611 */ 612 bdwrite(bp); 613 vp->v_lastw = lbn; 614 vp->v_lasta = bp->b_blkno; 615} 616 617 618/* 619 * This is an awful lot like cluster_rbuild...wish they could be combined. 620 * The last lbn argument is the current block on which I/O is being 621 * performed. Check to see that it doesn't fall in the middle of 622 * the current block (if last_bp == NULL). 623 */ 624int 625cluster_wbuild(vp, size, start_lbn, len) 626 struct vnode *vp; 627 long size; 628 daddr_t start_lbn; 629 int len; 630{ 631 struct buf *bp, *tbp; 632 int i, j, s; 633 int totalwritten = 0; 634 int dbsize = btodb(size); 635 636 if (vp->v_maxio == 0) 637 vp->v_maxio = DFLTPHYS; 638 while (len > 0) { 639 s = splbio(); 640 if (((tbp = gbincore(vp, start_lbn)) == NULL) || 641 ((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) { 642 ++start_lbn; 643 --len; 644 splx(s); 645 continue; 646 } 647 bremfree(tbp); 648 tbp->b_flags |= B_BUSY; 649 tbp->b_flags &= ~B_DONE; 650 splx(s); 651 652 /* 653 * Extra memory in the buffer, punt on this buffer. 
XXX we could 654 * handle this in most cases, but we would have to push the extra 655 * memory down to after our max possible cluster size and then 656 * potentially pull it back up if the cluster was terminated 657 * prematurely--too much hassle. 658 */ 659 if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) || 660 (tbp->b_bcount != tbp->b_bufsize) || 661 (tbp->b_bcount != size) || 662 (len == 1) || 663 ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) { 664 totalwritten += tbp->b_bufsize; 665 bawrite(tbp); 666 ++start_lbn; 667 --len; 668 continue; 669 } 670 671 /* 672 * We got a pbuf to make the cluster in. 673 * so initialise it. 674 */ 675 TAILQ_INIT(&bp->b_cluster.cluster_head); 676 bp->b_bcount = 0; 677 bp->b_bufsize = 0; 678 bp->b_npages = 0; 679 if (tbp->b_wcred != NOCRED) { 680 bp->b_wcred = tbp->b_wcred; 681 crhold(bp->b_wcred); 682 } 683 684 bp->b_blkno = tbp->b_blkno; 685 bp->b_lblkno = tbp->b_lblkno; 686 bp->b_offset = tbp->b_offset; 687 bp->b_data = (char *)((vm_offset_t)bp->b_data | 688 ((vm_offset_t)tbp->b_data & PAGE_MASK)); 689 bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER | 690 (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT)); 691 bp->b_iodone = cluster_callback; 692 pbgetvp(vp, bp); 693 /* 694 * From this location in the file, scan forward to see 695 * if there are buffers with adjacent data that need to 696 * be written as well. 697 */ 698 for (i = 0; i < len; ++i, ++start_lbn) { 699 if (i != 0) { /* If not the first buffer */ 700 s = splbio(); 701 /* 702 * If the adjacent data is not even in core it 703 * can't need to be written. 704 */ 705 if ((tbp = gbincore(vp, start_lbn)) == NULL) { 706 splx(s); 707 break; 708 } 709 710 /* 711 * If it IS in core, but has different 712 * characteristics, don't cluster with it. 
713 */ 714 if ((tbp->b_flags & 715 (B_VMIO | B_CLUSTEROK | B_INVAL | B_BUSY | 716 B_DELWRI | B_NEEDCOMMIT)) 717 != (B_DELWRI | B_CLUSTEROK | 718 (bp->b_flags & (B_VMIO | B_NEEDCOMMIT)))) { 719 splx(s); 720 break; 721 } 722 723 if (tbp->b_wcred != bp->b_wcred) { 724 splx(s); 725 break; 726 } 727 728 /* 729 * Check that the combined cluster 730 * would make sense with regard to pages 731 * and would not be too large 732 */ 733 if ((tbp->b_bcount != size) || 734 ((bp->b_blkno + (dbsize * i)) != 735 tbp->b_blkno) || 736 ((tbp->b_npages + bp->b_npages) > 737 (vp->v_maxio / PAGE_SIZE))) { 738 splx(s); 739 break; 740 } 741 /* 742 * Ok, it's passed all the tests, 743 * so remove it from the free list 744 * and mark it busy. We will use it. 745 */ 746 bremfree(tbp); 747 tbp->b_flags |= B_BUSY; 748 tbp->b_flags &= ~B_DONE; 749 splx(s); 750 } /* end of code for non-first buffers only */ 751 /* check for latent dependencies to be handled */ 752 if ((LIST_FIRST(&tbp->b_dep)) != NULL && 753 bioops.io_start) 754 (*bioops.io_start)(tbp); 755 /* 756 * If the IO is via the VM then we do some 757 * special VM hackery. 
(yuck) 758 */ 759 if (tbp->b_flags & B_VMIO) { 760 vm_page_t m; 761 762 if (i != 0) { /* if not first buffer */ 763 for (j = 0; j < tbp->b_npages; j += 1) { 764 m = tbp->b_pages[j]; 765 if (m->flags & PG_BUSY) 766 goto finishcluster; 767 } 768 } 769 770 for (j = 0; j < tbp->b_npages; j += 1) { 771 m = tbp->b_pages[j]; 772 vm_page_io_start(m); 773 vm_object_pip_add(m->object, 1); 774 if ((bp->b_npages == 0) || 775 (bp->b_pages[bp->b_npages - 1] != m)) { 776 bp->b_pages[bp->b_npages] = m; 777 bp->b_npages++; 778 } 779 } 780 } 781 bp->b_bcount += size; 782 bp->b_bufsize += size; 783 784 s = splbio(); 785 bundirty(tbp); 786 tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR); 787 tbp->b_flags |= B_ASYNC; 788 reassignbuf(tbp, tbp->b_vp); /* put on clean list */ 789 ++tbp->b_vp->v_numoutput; 790 splx(s); 791 TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, 792 tbp, b_cluster.cluster_entry); 793 } 794 finishcluster: 795 pmap_qenter(trunc_page((vm_offset_t) bp->b_data), 796 (vm_page_t *) bp->b_pages, bp->b_npages); 797 if (bp->b_bufsize > bp->b_kvasize) 798 panic( 799 "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", 800 bp->b_bufsize, bp->b_kvasize); 801 bp->b_kvasize = bp->b_bufsize; 802 totalwritten += bp->b_bufsize; 803 bp->b_dirtyoff = 0; 804 bp->b_dirtyend = bp->b_bufsize; 805 bawrite(bp); 806 807 len -= i; 808 } 809 return totalwritten; 810} 811 812/* 813 * Collect together all the buffers in a cluster. 814 * Plus add one additional buffer. 
815 */ 816static struct cluster_save * 817cluster_collectbufs(vp, last_bp) 818 struct vnode *vp; 819 struct buf *last_bp; 820{ 821 struct cluster_save *buflist; 822 struct buf *bp; 823 daddr_t lbn; 824 int i, len; 825 826 len = vp->v_lastw - vp->v_cstart + 1; 827 buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), 828 M_SEGMENT, M_WAITOK); 829 buflist->bs_nchildren = 0; 830 buflist->bs_children = (struct buf **) (buflist + 1); 831 for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { 832 (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp); 833 buflist->bs_children[i] = bp; 834 if (bp->b_blkno == bp->b_lblkno) 835 VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, 836 NULL, NULL); 837 } 838 buflist->bs_children[i] = bp = last_bp; 839 if (bp->b_blkno == bp->b_lblkno) 840 VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, 841 NULL, NULL); 842 buflist->bs_nchildren = i + 1; 843 return (buflist); 844} 845