/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $Id: vfs_bio.c,v 1.14 1994/10/05 09:48:21 davidg Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>

#include <miscfs/specfs/specdev.h>

struct buf *buf;		/* buffer header pool */
int nbuf;			/* number of buffer headers calculated elsewhere */
struct swqueue bswlist;

extern vm_map_t buffer_map, io_map;

void vm_hold_free_pages(vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(vm_offset_t from, vm_offset_t to);

int needsbuffer;

/*
 * Internal update daemon, process 3
 * The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;

/*
 * Initialize buffer headers and related structures.
 */
void
bufinit()
{
	struct buf *bp;
	int i;
	caddr_t baddr;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for (i = 0; i < BUFHSZ; i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	baddr = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = baddr + i * MAXBSIZE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
}

/*
 * Remove the buffer from the appropriate free list.
 */
void
bremfree(struct buf *bp)
{
	int s = splbio();

	if (bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}
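
/*
 * Usage sketch: a caller that claims a buffer must take it off its free
 * list with bremfree() before marking it busy, all under splbio().  This
 * hypothetical helper (disabled) mirrors the pattern getnewbuf() uses
 * below.
 */
#if 0
static struct buf *
example_claim_first_lru(void)
{
	struct buf *bp;
	int s = splbio();

	if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
		bremfree(bp);		/* sets b_qindex = QUEUE_NONE */
		bp->b_flags |= B_BUSY;	/* claim before dropping splbio */
	}
	splx(s);
	return (bp);
}
#endif
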
/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred,
    struct buf **bpp)
{
	struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		return (biowait(bp));
	}

	return (0);
}

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode *vp, daddr_t blkno, int size,
    daddr_t *rablkno, int *rabsize,
    int cnt, struct ucred *cred, struct buf **bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk(vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		++readwait;
	}

	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (incore(vp, *rablkno)) {
			continue;
		}
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc && curproc->p_stats)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
			if (rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if (readwait) {
		rv = biowait(bp);
	}

	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
int
bwrite(struct buf *bp)
{
	int oldflags = bp->b_flags;

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	if (!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if (oldflags & B_ASYNC) {
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if (curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
	}

	bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);

		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if (curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
		brelse(bp);
		return (rtval);
	}

	return (0);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}
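
/*
 * Usage sketch: the read-modify-write cycle a filesystem performs with
 * bread()/bwrite() above.  Hypothetical helper, disabled; note that
 * bread() hands back a buffer even on error, so it must still be
 * released.
 */
#if 0
static int
example_rmw(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred)
{
	struct buf *bp;
	int error;

	error = bread(vp, blkno, size, cred, &bp);
	if (error) {
		brelse(bp);		/* release even on I/O error */
		return (error);
	}
	((char *) bp->b_data)[0] = 0;	/* modify the cached contents */
	return (bwrite(bp));		/* synchronous: waits, then releases */
}
#endif
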
/*
 * Delayed write.  (Buffer is marked dirty.)
 */
void
bdwrite(struct buf *bp)
{

	if ((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

	if (bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}

	bp->b_flags &= ~B_READ;
	if ((bp->b_flags & B_DELWRI) == 0) {
		if (curproc)
			++curproc->p_stats->p_ru.ru_oublock;
		bp->b_flags |= B_DONE | B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}
	brelse(bp);
	return;
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf *bp)
{
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 */
void
brelse(struct buf *bp)
{
	int x;

	/* anyone need a "free" block? */
	x = splbio();
	if (needsbuffer) {
		needsbuffer = 0;
		wakeup((caddr_t) &needsbuffer);
	}

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED | B_AGE);
		wakeup((caddr_t) bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
	    (bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI | B_CACHE);
		if (bp->b_vp)
			brelvp(bp);
	}

	if (bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	/* buffers with no memory */
	if (bp->b_bufsize == 0) {
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers with junk contents */
	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE)) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers that are locked */
	} else if (bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else if (bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
	/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE);
	splx(x);
}

int freebufspace;
int allocbufspace;
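
/*
 * Usage sketch: how a caller chooses among the three write flavors
 * above.  The helper and its "how" argument are hypothetical and the
 * block is disabled.
 */
#if 0
static int
example_write(struct buf *bp, int how)
{
	switch (how) {
	case 0:
		return (bwrite(bp));	/* synchronous: wait for completion */
	case 1:
		bawrite(bp);		/* async: released by biodone/brelse */
		return (0);
	default:
		bdwrite(bp);		/* delayed: marked B_DELWRI, flushed
					   later by the update daemon or when
					   the buffer is reclaimed */
		return (0);
	}
}
#endif
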
/*
 * Find a buffer header which is available for use.
 */
struct buf *
getnewbuf(int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;

	s = splbio();
start:
	/* can we constitute a new buffer? */
	if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
		if (bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}

	if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
		if (bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue");
		bremfree(bp);
	} else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
		if (bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue");
		bremfree(bp);
	} else {
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep((caddr_t) &needsbuffer, PRIBIO, "newbuf", 0);
		splx(s);
		return (0);
	}

	/* if we are a delayed write, convert to an async write */
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_BUSY;
		bawrite(bp);
		goto start;
	}

	if (bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
fillbuf:
	bp->b_flags = B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(s);
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode *vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;
	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp) {
#ifdef DEBUG
		if ((bp < buf) || (bp >= buf + nbuf)) {
			printf("incore: buf out of range: %p, hash: %d\n",
			    bp, bh - bufhashtbl);
			panic("incore: buf fault");
		}
#endif
		/* hit */
		if (bp->b_lblkno == blkno && bp->b_vp == vp
		    && (bp->b_flags & B_INVAL) == 0) {
			splx(s);
			return (bp);
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);

	return (0);
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	struct bufhashhdr *bh;

	s = splbio();
loop:
	if ((bp = incore(vp, blkno))) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			tsleep((caddr_t) bp, PRIBIO, "getblk", 0);
			goto loop;
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);
		/*
		 * check for size inconsistencies
		 */
		if (bp->b_bcount != size) {
			printf("getblk: invalid buffer size: %ld\n",
			    bp->b_bcount);
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
	} else {
		if ((bp = getnewbuf(0, 0)) == 0)
			goto loop;
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);
		allocbuf(bp, size);
	}
	splx(s);
	return (bp);
}
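
/*
 * Usage sketch: peeking at the cache with incore() before paying for a
 * full getblk().  Hypothetical helper, disabled; the block could still
 * be evicted between the two calls, in which case getblk() returns a
 * fresh buffer without B_CACHE set.
 */
#if 0
static struct buf *
example_lookup(struct vnode *vp, daddr_t blkno, int size)
{
	if (incore(vp, blkno) == NULL)
		return (NULL);	/* not resident; caller may read ahead */
	return (getblk(vp, blkno, size, 0, 0));	/* busy buffer, B_CACHE
						   set if still resident */
}
#endif
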
/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;

	while ((bp = getnewbuf(0, 0)) == 0)
		;
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}

/*
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course, the buffer is shrinking).
 */
void
allocbuf(struct buf *bp, int size)
{
	int newbsize = round_page(size);

	if (newbsize == bp->b_bufsize) {
		bp->b_bcount = size;
		return;
	} else if (newbsize < bp->b_bufsize) {
		vm_hold_free_pages(
		    (vm_offset_t) bp->b_data + newbsize,
		    (vm_offset_t) bp->b_data + bp->b_bufsize);
	} else if (newbsize > bp->b_bufsize) {
		vm_hold_load_pages(
		    (vm_offset_t) bp->b_data + bp->b_bufsize,
		    (vm_offset_t) bp->b_data + newbsize);
	}

	/* adjust buffer cache's idea of memory allocated to buffer contents */
	freebufspace -= newbsize - bp->b_bufsize;
	allocbufspace += newbsize - bp->b_bufsize;

	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
}

/*
 * Wait for buffer I/O completion, returning error status.
 */
int
biowait(register struct buf *bp)
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep((caddr_t) bp, PRIBIO, "biowait", 0);
	if ((bp->b_flags & B_ERROR) || bp->b_error) {
		if ((bp->b_flags & B_INVAL) == 0) {
			bp->b_flags |= B_INVAL;
			bp->b_dev = NODEV;
			LIST_REMOVE(bp, b_hash);
			LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		}
		if (!bp->b_error)
			bp->b_error = EIO;
		else
			bp->b_flags |= B_ERROR;
		splx(s);
		return (bp->b_error);
	} else {
		splx(s);
		return (0);
	}
}

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
 */
void
biodone(register struct buf *bp)
{
	int s;

	s = splbio();
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0) {
		vwakeup(bp);
	}

#ifdef BOUNCE_BUFFERS
	if (bp->b_flags & B_BOUNCE)
		vm_bounce_free(bp);
#endif

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		splx(s);
		return;
	}

	/*
	 * For asynchronous completions, release the buffer now.  The brelse
	 * checks for B_WANTED and will do the wakeup there if necessary -
	 * so no need to do a wakeup here in the async case.
	 */
	if (bp->b_flags & B_ASYNC) {
		brelse(bp);
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t) bp);
	}
	splx(s);
}

int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
	    bp != NULL;
	    bp = bp->b_freelist.tqe_next)
		count++;
	return (count);
}

int vfs_update_interval = 30;

void
vfs_update()
{
	(void) spl0();
	while (1) {
		tsleep((caddr_t) &vfs_update_wakeup, PRIBIO, "update",
		    hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}

#if 0
#define MAXFREEBP	128
#define LDFREE_BUSY	1
#define LDFREE_WANT	2
int loadfreeing;
struct buf *freebp[MAXFREEBP];
#endif
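
/*
 * Usage sketch: arranging a completion callback via B_CALL, which
 * biodone() above consumes (it clears B_CALL, calls b_iodone, and skips
 * the brelse, so the callback must release the buffer itself).  Both
 * helpers are hypothetical and disabled; the buffer is assumed to be
 * otherwise set up for I/O.
 */
#if 0
static void
example_done(struct buf *bp)
{
	/* runs from biodone(), possibly at interrupt level: do not sleep */
	brelse(bp);
}

static void
example_start(struct buf *bp)
{
	bp->b_flags |= B_CALL;
	bp->b_iodone = example_done;
	VOP_STRATEGY(bp);
}
#endif
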
/*
 * These routines are not in the correct place (yet).
 * Also, they work *ONLY* for the kernel_pmap!!!
 */
void
vm_hold_load_pages(vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for (pg = from; pg < to; pg += PAGE_SIZE) {

tryagain:
#if 0
		/*
		 * don't allow buffer cache to cause VM paging
		 */
		if (cnt.v_free_count < cnt.v_free_min) {
			if (!loadfreeing) {
				int n = 0;
				struct buf *bp;

				loadfreeing = LDFREE_BUSY;
				while ((cnt.v_free_count <= cnt.v_free_min) &&
				    (n < MAXFREEBP)) {
					bp = geteblk(0);
					if (bp)
						freebp[n++] = bp;
					else
						break;
				}
				while (--n >= 0) {
					brelse(freebp[n]);
				}
				if (loadfreeing & LDFREE_WANT)
					wakeup((caddr_t) &loadfreeing);
				loadfreeing = 0;
			} else {
				loadfreeing |= LDFREE_WANT;
				tsleep(&loadfreeing, PRIBIO, "biofree", 0);
			}
		}
#endif
		if (cnt.v_free_count <=
		    cnt.v_free_reserved + (toa - froma) / PAGE_SIZE) {
			VM_WAIT;
			goto tryagain;
		}

		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS);
		if (!p) {
			VM_WAIT;
			goto tryagain;
		}

		vm_page_wire(p);
		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
	}
}

void
vm_hold_free_pages(vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for (pg = from; pg < to; pg += PAGE_SIZE) {
		p = PHYS_TO_VM_PAGE(pmap_kextract(pg));
		pmap_kremove(pg);
		vm_page_free(p);
	}
}

void
bufstats()
{
}
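
/*
 * Usage sketch: allocbuf() above drives the two vm_hold_* routines;
 * growing and then shrinking a busy buffer exercises both paths.
 * Hypothetical helper, disabled; assumes bp came from getnewbuf() or
 * geteblk() and is B_BUSY.
 */
#if 0
static void
example_resize(struct buf *bp)
{
	allocbuf(bp, 2 * PAGE_SIZE);	/* grow: vm_hold_load_pages wires
					   and maps the new pages */
	allocbuf(bp, PAGE_SIZE);	/* shrink: vm_hold_free_pages
					   unmaps and frees the tail */
}
#endif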