/* vfs_bio.c, revision 1817 */
/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $Id$
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>

#include <miscfs/specfs/specdev.h>

struct buf *buf;	/* buffer header pool */
int nbuf;		/* number of buffer headers calculated elsewhere */

extern vm_map_t buffer_map, io_map;

void vm_hold_free_pages(vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(vm_offset_t from, vm_offset_t to);

int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;

/*
 * Initialize buffer headers and related structures.
 */
void
bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for (i = 0; i < BUFHSZ; i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = (caddr_t)kmem_alloc_pageable(buffer_map, MAXBSIZE);
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
}

/*
 * Remove the buffer from the appropriate free list.
 */
void
bremfree(struct buf *bp)
{
	int s = splbio();

	if (bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}
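
/*
 * Free list discipline, as implemented by brelse() and getnewbuf()
 * below (a summary, not gospel): QUEUE_EMPTY holds headers with no
 * backing storage, QUEUE_AGE holds buffers whose contents are invalid
 * or stale, QUEUE_LRU holds valid and potentially reusable buffers,
 * and QUEUE_LOCKED holds buffers that may not be recycled.  A buffer
 * on no free list at all has b_qindex == QUEUE_NONE and is normally
 * B_BUSY.
 */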
/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred,
	struct buf **bpp)
{
	struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		return (biowait(bp));
	}

	return (0);
}

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode *vp, daddr_t blkno, int size,
	daddr_t *rablkno, int *rabsize,
	int cnt, struct ucred *cred, struct buf **bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk(vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		++readwait;
	}

	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (incore(vp, *rablkno))
			continue;
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc && curproc->p_stats)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
			if (rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if (readwait)
		rv = biowait(bp);

	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
int
bwrite(struct buf *bp)
{
	int oldflags = bp->b_flags;

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	if (!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if (oldflags & B_ASYNC) {
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if (curproc && curproc->p_stats) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
	}

	bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);

		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if (curproc && curproc->p_stats) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
		brelse(bp);
		return (rtval);
	}

	return (0);
}

int
vn_bwrite(struct vop_bwrite_args *ap)
{
	return (bwrite(ap->a_bp));
}
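
/*
 * Example (a sketch of a hypothetical caller, not code from this
 * file): a filesystem that has dirtied a buffer picks among the
 * write interfaces in this file according to how urgently the data
 * must reach disk:
 *
 *	error = bwrite(bp);	synchronous: wait, return I/O status
 *	bawrite(bp);		asynchronous: start I/O, biodone()
 *				releases bp on completion
 *	bdwrite(bp);		delayed: mark B_DELWRI and release
 *
 * In every case the caller gives up its reference to bp.
 */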
/*
 * Delayed write.  (Buffer is marked dirty.)
 */
void
bdwrite(struct buf *bp)
{

	if ((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

	if (bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}

	bp->b_flags &= ~B_READ;
	if ((bp->b_flags & B_DELWRI) == 0) {
		if (curproc && curproc->p_stats)
			++curproc->p_stats->p_ru.ru_oublock;
		bp->b_flags |= B_DONE | B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}
	brelse(bp);
	return;
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf *bp)
{
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 */
void
brelse(struct buf *bp)
{
	int x;

	/* anyone need a "free" block? */
	x = splbio();
	if (needsbuffer) {
		needsbuffer = 0;
		wakeup((caddr_t)&needsbuffer);
	}

	/* anyone need this very block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED | B_AGE);
		wakeup((caddr_t)bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
	    (bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI | B_CACHE);
		if (bp->b_vp)
			brelvp(bp);
	}

	if (bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	/* buffers with no backing storage */
	if (bp->b_bufsize == 0) {
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers with junk contents */
	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE)) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers that are locked */
	} else if (bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else if (bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
	/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE);
	splx(x);
}

int freebufspace;
int allocbufspace;
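
/*
 * A note on getnewbuf() below (summarizing the code; the retry
 * protocol is easy to get wrong): buffers are recycled in the order
 * EMPTY, AGE, LRU.  A delayed-write buffer found at a queue head is
 * pushed out with bawrite() and the scan restarts; if no buffer of
 * any kind is available, the caller is put to sleep on needsbuffer
 * and getnewbuf() returns NULL, so callers must be prepared to retry
 * (as getblk() and geteblk() do).
 */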
/*
 * Find a buffer header which is available for use.
 */
struct buf *
getnewbuf(int slpflag, int slptimeo)
{
	struct buf *bp;
	int x;

	x = splbio();
start:
	/* can we constitute a new buffer from an empty header? */
	if ((bp = bufqueues[QUEUE_EMPTY].tqh_first) != NULL) {
		if (bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}

	if ((bp = bufqueues[QUEUE_AGE].tqh_first) != NULL) {
		if (bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue");
		bremfree(bp);
	} else if ((bp = bufqueues[QUEUE_LRU].tqh_first) != NULL) {
		if (bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue");
		bremfree(bp);
	} else {
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep((caddr_t)&needsbuffer, PRIBIO, "newbuf", 0);
		splx(x);
		return (0);
	}

	/* if we are a delayed write, convert to an async write */
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_BUSY;
		bawrite(bp);
		goto start;
	}

	if (bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
fillbuf:
	bp->b_flags = B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(x);
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode *vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;
	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp) {
		if ((bp < buf) || (bp >= buf + nbuf)) {
			printf("incore: buf out of range: %lx, hash: %d\n",
			    bp, bh - bufhashtbl);
			panic("incore: buf fault");
		}
		/* hit */
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0) {
			splx(s);
			return (bp);
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);

	return (0);
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int x;
	struct bufhashhdr *bh;

	x = splbio();
loop:
	if ((bp = incore(vp, blkno)) != NULL) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			tsleep((caddr_t)bp, PRIBIO, "getblk", 0);
			goto loop;
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);
		/*
		 * check for size inconsistencies
		 */
		if (bp->b_bcount != size) {
			printf("getblk: invalid buffer size: %d\n",
			    bp->b_bcount);
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
	} else {
		if ((bp = getnewbuf(0, 0)) == 0)
			goto loop;
		allocbuf(bp, size);
		/*
		 * have to check again, because of a possible
		 * race condition.
		 */
		if (incore(vp, blkno)) {
			allocbuf(bp, 0);
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);
	}
	splx(x);
	return (bp);
}
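
/*
 * Example (a sketch of a hypothetical caller): geteblk() below hands
 * back a buffer deliberately marked B_INVAL, so on release it goes to
 * the head of the age queue for quick recycling instead of being
 * cached:
 *
 *	bp = geteblk(bsize);
 *	... use bp->b_data as scratch storage ...
 *	brelse(bp);
 */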
/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;

	while ((bp = getnewbuf(0, 0)) == 0)
		;
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}

/*
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course, the buffer is shrinking).
 */
void
allocbuf(struct buf *bp, int size)
{
	int newbsize = round_page(size);

	if (newbsize == bp->b_bufsize) {
		bp->b_bcount = size;
		return;
	} else if (newbsize < bp->b_bufsize) {
		vm_hold_free_pages(
		    (vm_offset_t)bp->b_data + newbsize,
		    (vm_offset_t)bp->b_data + bp->b_bufsize);
	} else if (newbsize > bp->b_bufsize) {
		vm_hold_load_pages(
		    (vm_offset_t)bp->b_data + bp->b_bufsize,
		    (vm_offset_t)bp->b_data + newbsize);
	}

	/* adjust buffer cache's idea of memory allocated to buffer contents */
	freebufspace -= newbsize - bp->b_bufsize;
	allocbufspace += newbsize - bp->b_bufsize;

	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
}

/*
 * Wait for buffer I/O completion, returning error status.
 */
int
biowait(register struct buf *bp)
{
	int x;

	x = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep((caddr_t)bp, PRIBIO, "biowait", 0);
	if ((bp->b_flags & B_ERROR) || bp->b_error) {
		if ((bp->b_flags & B_INVAL) == 0) {
			bp->b_flags |= B_INVAL;
			bp->b_dev = NODEV;
			LIST_REMOVE(bp, b_hash);
			LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		}
		/* make sure the error flag and the error code agree */
		if (!bp->b_error)
			bp->b_error = EIO;
		else
			bp->b_flags |= B_ERROR;
		splx(x);
		return (bp->b_error);
	} else {
		splx(x);
		return (0);
	}
}

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
 */
void
biodone(register struct buf *bp)
{
	int s;

	s = splbio();
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0) {
		vwakeup(bp);
	}

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		splx(s);
		return;
	}

	/*
	 * For asynchronous completions, release the buffer now. The brelse
	 * checks for B_WANTED and will do the wakeup there if necessary -
	 * so no need to do a wakeup here in the async case.
	 */
	if (bp->b_flags & B_ASYNC) {
		brelse(bp);
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
	splx(s);
}

int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
	    bp != NULL;
	    bp = bp->b_freelist.tqe_next)
		count++;
	return (count);
}

#ifndef UPDATE_INTERVAL
int vfs_update_interval = 30;
#else
int vfs_update_interval = UPDATE_INTERVAL;
#endif

void
vfs_update()
{
	(void) spl0();
	while (1) {
		tsleep((caddr_t)&vfs_update_wakeup, PRIBIO, "update",
		    hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}
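
/*
 * The two vm_hold routines below give allocbuf() its page backing:
 * vm_hold_load_pages() allocates and wires fresh pages behind a range
 * of buffer KVA, and vm_hold_free_pages() unmaps and frees them again.
 * (The page offsets into kernel_object are derived directly from the
 * KVA, so this only makes sense for addresses in the kernel map.)
 */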
/*
 * these routines are not in the correct place (yet)
 * also they work *ONLY* for kernel_pmap!!!
 */
void
vm_hold_load_pages(vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for (pg = from; pg < to; pg += PAGE_SIZE) {
tryagain:
		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS);
		if (!p) {
			VM_WAIT;
			goto tryagain;
		}

		vm_page_wire(p);
		pmap_enter(kernel_pmap, pg, VM_PAGE_TO_PHYS(p),
		    VM_PROT_READ | VM_PROT_WRITE, 1);
	}
}

void
vm_hold_free_pages(vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for (pg = from; pg < to; pg += PAGE_SIZE) {
		vm_offset_t pa;

		pa = pmap_kextract(pg);
		if (!pa) {
			printf("No pa for va: %x\n", pg);
		} else {
			p = PHYS_TO_VM_PAGE(pa);
			pmap_remove(kernel_pmap, pg, pg + PAGE_SIZE);
			vm_page_free(p);
		}
	}
}

void
bufstats()
{
}