vfs_cluster.c revision 1542
/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>

#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
int doreallocblks = 1;
struct ctldebug debug13 = { "doreallocblks", &doreallocblks };
#else
/* XXX for cluster_write */
#define doreallocblks 1
#endif

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long,
	    daddr_t, int, daddr_t));
struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));

#ifdef DIAGNOSTIC
/*
 * Set to 1 if reads of block zero should cause readahead to be done.
 * Set to 0 treats a read of block zero as a non-sequential read.
 *
 * Setting to one assumes that most reads of block zero of files are due to
 * sequential passes over the files (e.g. cat, sum) where additional blocks
 * will soon be needed.  Setting to zero assumes that the majority are
 * surgical strikes to get particular info (e.g. size, file) where readahead
 * blocks will not be used and, in fact, push out other potentially useful
 * blocks from the cache.  The former seems intuitive, but some quick tests
 * showed that the latter performed better from a system-wide point of view.
 */
int doclusterraz = 0;
#define ISSEQREAD(vp, blk) \
	(((blk) != 0 || doclusterraz) && \
	    ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#else
#define ISSEQREAD(vp, blk) \
	((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#endif

/*
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 *	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & B_CACHE) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1);
		alreadyincore = (int)incore(vp, ioblkno);
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		bp->b_flags |= B_READ;
		ioblkno = lblkno;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, reset the window to 1.
	 * Note that a read to the same block is considered sequential.
	 * This catches the case where the file is being read sequentially,
	 * but at smaller than the filesystem block size.
	 */
	rbp = NULL;
	if (!ISSEQREAD(vp, lblkno)) {
		vp->v_ralen = 0;
		vp->v_maxra = lblkno;
	} else if ((ioblkno + 1) * size <= filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) &&
	    blkno != -1) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead.
		 */
		if (num_ra) {
			/*
			 * If our desired readahead block had been read
			 * in a previous readahead but is no longer in
			 * core, then we may be reading ahead too far
			 * or are not using our readahead very rapidly.
			 * In this case we scale back the window.
			 */
			if (!alreadyincore && ioblkno <= vp->v_maxra)
				vp->v_ralen = max(vp->v_ralen >> 1, 1);
			/*
			 * There are more sequential blocks than our current
			 * window allows, scale up.  Ideally we want to get
			 * in sync with the filesystem maxcontig value.
			 */
			else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr)
				vp->v_ralen = vp->v_ralen ?
				    min(num_ra, vp->v_ralen << 1) : 1;

			if (num_ra > vp->v_ralen)
				num_ra = vp->v_ralen;
		}

		if (num_ra)				/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (ioblkno == lblkno) {
			bp->b_blkno = blkno;
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    incore(vp, ioblkno) || (error = VOP_BMAP(vp,
			    ioblkno, NULL, &blkno, &num_ra)) || blkno == -1)
				goto skip_readahead;
			/*
			 * Adjust readahead as above
			 */
			if (num_ra) {
				if (!alreadyincore && ioblkno <= vp->v_maxra)
					vp->v_ralen = max(vp->v_ralen >> 1, 1);
				else if (num_ra > vp->v_ralen &&
				    lblkno != vp->v_lastr)
					vp->v_ralen = vp->v_ralen ?
					    min(num_ra, vp->v_ralen << 1) : 1;
				if (num_ra > vp->v_ralen)
					num_ra = vp->v_ralen;
			}
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		}

		if (rbp == bp)			/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	/*
	 * Recalculate our maximum readahead
	 */
	if (rbp == NULL)
		rbp = bp;
	if (rbp)
		vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1;

	if (bp)
		return(biowait(bp));
	return(error);
}
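
/*
 * Illustrative summary of the read-ahead window managed above (a sketch;
 * it assumes the caller, e.g. the file system read routine, updates
 * v_lastr after each call -- v_lastr is only read, never written, in this
 * file):
 *
 *	- A non-sequential read resets the window: v_ralen goes to 0 and
 *	  v_maxra is pulled back to the current block.
 *	- A sequential read that finds more contiguous blocks (num_ra from
 *	  VOP_BMAP) than the current window doubles the window, so a steady
 *	  sequential reader sees v_ralen grow 1, 2, 4, ... bounded by the
 *	  contiguity of the file.
 *	- If the block to be read or read ahead falls inside the range
 *	  already fetched by a previous read-ahead (ioblkno <= v_maxra) yet
 *	  is no longer in core, the reader is outrunning the cache and the
 *	  window is halved instead.
 *	- v_maxra records the last block covered by the most recent
 *	  cluster_read I/O, so the next call can tell whether its
 *	  read-ahead candidates were already covered.
 */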

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	inc = btodb(size);
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		if (incore(vp, lbn + i)) {
			if (i == 1) {
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				allocbuf(bp, size);
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		/*
		 * getblk may return some memory in the buffer if there were
		 * no empty buffers to shed it to.  If there is currently
		 * memory in the buffer, we move it down size bytes to make
		 * room for the valid pages that cluster_callback will insert.
		 * We do this now so we don't have to do it at interrupt time
		 * in the callback routine.
		 */
		if (tbp->b_bufsize != 0) {
			caddr_t bdata = (char *)tbp->b_data;

			if (tbp->b_bufsize + size > MAXBSIZE)
				panic("cluster_rbuild: too much memory");
			if (tbp->b_bufsize > size) {
				/*
				 * XXX if the source and destination regions
				 * overlap we have to copy backward to avoid
				 * clobbering any valid pages (i.e. pagemove
				 * implementations typically can't handle
				 * overlap).
				 */
				bdata += tbp->b_bufsize;
				while (bdata > (char *)tbp->b_data) {
					bdata -= CLBYTES;
					pagemove(bdata, bdata + size, CLBYTES);
				}
			} else
				pagemove(bdata, bdata + size, tbp->b_bufsize);
		}
		tbp->b_blkno = bn;
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	return(bp);
}

/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size, 0, 0);
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return(bp);
		}
	}
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return(bp);
}
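
/*
 * How the memory moves in a clustered read (a sketch of what
 * cluster_rbuild() above and cluster_callback() below do between them):
 * cluster_newbuf()/allocbuf() give the parent buffer enough memory for the
 * whole cluster, (run + 1) * size bytes, while the child buffers obtained
 * with getblk(vp, lbn + i, 0, ...) are created owning no memory.  Any
 * memory a child does happen to own is shifted "size" bytes further into
 * its buffer now, so that when the single large I/O completes,
 * cluster_callback() only has to pagemove() each child's "size" bytes out
 * of the parent and biodone() the child.  pagemove() remaps pages rather
 * than copying them, so the data itself is never copied.
 */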

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **bpp, *tbp;
	long bsize;
	caddr_t cp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

	bsize = b_save->bs_bufsize;
	cp = (char *)bp->b_data + bsize;
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
		tbp = *bpp;
		pagemove(cp, tbp->b_data, bsize);
		tbp->b_bufsize += bsize;
		tbp->b_bcount = bsize;
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		}
		biodone(tbp);
		bp->b_bufsize -= bsize;
		cp += bsize;
	}
	/*
	 * If there was excess memory in the cluster buffer,
	 * slide it up adjacent to the remaining valid data.
	 */
	if (bp->b_bufsize != bsize) {
		if (bp->b_bufsize < bsize)
			panic("cluster_callback: too little memory");
		pagemove(cp, (char *)bp->b_data + bsize, bp->b_bufsize - bsize);
	}
	bp->b_bcount = bsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1.  Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.  beginning of cluster - begin cluster
 *	3.  middle of a cluster - add to cluster
 *	4.  end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) {
		maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * has seeked to another point in the file since its
			 * last write, or we have reached our maximum
			 * cluster size, then push the previous cluster.
			 * Otherwise try reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
			if (!doreallocblks ||
			    (lbn + 1) * bp->b_bcount != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				cluster_wbuild(vp, NULL, bp->b_bcount,
				    vp->v_cstart, cursize, lbn);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 */
					for (bpp = buflist->bs_children;
					    bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, NULL, bp->b_bcount,
					    vp->v_cstart, cursize, lbn);
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					    bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster.
		 * If at end of file, make cluster as large as possible,
		 * otherwise find size of existing cluster.
		 */
		if ((lbn + 1) * bp->b_bcount != filesize &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) ||
		    bp->b_blkno == -1)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (maxclen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}
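
/*
 * Worked example of the cluster_write() state machine above (a sketch;
 * the block counts assume an 8K-block file system and a 64K MAXBSIZE, so
 * maxclen works out to 7):
 *
 *	write of lbn 0:	no cluster in progress and we are appending at end
 *			of file, so the window is opened as wide as
 *			possible: v_cstart = 0, v_clen = 7, and the block is
 *			merely delayed with bdwrite().
 *	writes 1-6:	sequential and contiguous, and lbn is short of
 *			v_cstart + v_clen, so each block is just delayed
 *			(middle of a cluster).
 *	write of lbn 7:	lbn == v_cstart + v_clen, so cluster_wbuild()
 *			gathers blocks 0-7 into one buffer and issues a
 *			single asynchronous write; v_cstart moves to 8.
 *
 * A seek, or an allocation that is not physically contiguous, pushes the
 * partial cluster out through cluster_wbuild(); the one exception is a
 * logically sequential write at end of file with room left in the window,
 * which first tries VOP_REALLOCBLKS() to make the blocks contiguous.
 */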

/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t cp;
	int i, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %d != filesize %d\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Get more memory for current buffer */
	if (len <= 1) {
		if (last_bp) {
			bawrite(last_bp);
		} else if (len) {
			bp = getblk(vp, start_lbn, size, 0, 0);
			bawrite(bp);
		}
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	/*
	 * Extra memory in the buffer, punt on this buffer.
	 * XXX we could handle this in most cases, but we would have to
	 * push the extra memory down to after our max possible cluster
	 * size and then potentially pull it back up if the cluster was
	 * terminated prematurely--too much hassle.
	 */
	if (bp->b_bcount != bp->b_bufsize) {
		++start_lbn;
		--len;
		bawrite(bp);
		goto redo;
	}

	--len;
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = (char *)bp->b_data + size;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		/*
		 * Block is not in core or the non-sequential block
		 * ending our cluster was part of the cluster (in which
		 * case we don't want to write it twice).
		 */
		if (!incore(vp, start_lbn) ||
		    last_bp == NULL && start_lbn == lbn)
			break;

		/*
		 * Get the desired block buffer (unless it is the final
		 * sequential block whose buffer was passed in explicitly
		 * as last_bp).
		 */
		if (last_bp == NULL || start_lbn != lbn) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) {
			printf("Clustered Block: %d addr %x bufsize: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
			    tbp->b_blkno);
			panic("Clustered write to wrong blocks");
		}

		pagemove(tbp->b_data, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_bufsize -= size;
		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= (B_ASYNC | B_AGE);
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += size;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}

/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *buflist;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **)(buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
		(void)bread(vp, lbn, last_bp->b_bcount, NOCRED,
		    &buflist->bs_children[i]);
	buflist->bs_children[i] = last_bp;
	buflist->bs_nchildren = i + 1;
	return (buflist);
}